Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/changes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ v0.7
`Adrin Jalali`_.
- Fix: :func:`skops.io.visualize` is now capable of showing bytes. :pr:`352` by
`Benjamin Bossan`_.
- All public ``numpy`` ufuncs (Universal Functions) and dtypes are trusted by default
by :func:`skops.io.load`. :pr:`336` by :user:`Omar Arab Oghli <omar-araboghli>`.

v0.6
----
Expand Down
5 changes: 3 additions & 2 deletions docs/persistence.rst
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ using :func:`skops.io.get_untrusted_types`:
from skops.io import get_untrusted_types
unknown_types = get_untrusted_types(file="my-model.skops")
print(unknown_types)
['numpy.float64', 'numpy.int64', 'sklearn.metrics._scorer._passthrough_scorer',
['sklearn.metrics._scorer._passthrough_scorer',
'xgboost.core.Booster', 'xgboost.sklearn.XGBClassifier']

Note that everything in the above list is safe to load. We already have many
Expand All @@ -108,7 +108,8 @@ At the moment, ``skops`` cannot persist arbitrary Python code. This means if
you have custom functions (say, a custom function to be used with
:class:`sklearn.preprocessing.FunctionTransformer`), it will not work. However,
most ``numpy`` and ``scipy`` functions should work. Therefore, you can save
objects having references to functions such as ``numpy.sqrt``.
objects having references to functions or universal functions (ufuncs)
such as ``numpy.sqrt``.

Compression
~~~~~~~~~~~
Expand Down
10 changes: 8 additions & 2 deletions skops/io/_general.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
from ._audit import Node, get_tree
from ._protocol import PROTOCOL
from ._trusted_types import (
NUMPY_DTYPE_TYPE_NAMES,
NUMPY_UFUNC_TYPE_NAMES,
PRIMITIVE_TYPE_NAMES,
SCIPY_UFUNC_TYPE_NAMES,
SKLEARN_ESTIMATOR_TYPE_NAMES,
Expand Down Expand Up @@ -210,7 +212,9 @@ def __init__(
) -> None:
super().__init__(state, load_context, trusted)
# TODO: what do we trust?
self.trusted = self._get_trusted(trusted, default=SCIPY_UFUNC_TYPE_NAMES)
self.trusted = self._get_trusted(
trusted, default=SCIPY_UFUNC_TYPE_NAMES + NUMPY_UFUNC_TYPE_NAMES
)
self.children = {}

def _construct(self):
Expand Down Expand Up @@ -293,7 +297,9 @@ def __init__(
) -> None:
super().__init__(state, load_context, trusted)
# TODO: what do we trust?
self.trusted = self._get_trusted(trusted, PRIMITIVE_TYPE_NAMES)
self.trusted = self._get_trusted(
trusted, PRIMITIVE_TYPE_NAMES + NUMPY_DTYPE_TYPE_NAMES
)
# We use a bare Node type here since a Node only checks the type in the
# dict using __class__ and __module__ keys.
self.children = {}
Expand Down
9 changes: 8 additions & 1 deletion skops/io/_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from ._audit import Node, get_tree
from ._general import function_get_state
from ._protocol import PROTOCOL
from ._trusted_types import NUMPY_DTYPE_TYPE_NAMES
from ._utils import LoadContext, SaveContext, get_module, get_state, gettype
from .exceptions import UnsupportedTypeException

Expand Down Expand Up @@ -52,6 +53,10 @@ def ndarray_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]:


class NdArrayNode(Node):
# TODO: NdArrayNode is not only responsible for np.arrays
# but also for np.generics, thus the confusion with DTypeNode.
# See PR-336

def __init__(
self,
state: dict[str, Any],
Expand All @@ -60,7 +65,9 @@ def __init__(
) -> None:
super().__init__(state, load_context, trusted)
self.type = state["type"]
self.trusted = self._get_trusted(trusted, [np.ndarray])
self.trusted = self._get_trusted(
trusted, [np.ndarray] + NUMPY_DTYPE_TYPE_NAMES # type: ignore
)
if self.type == "numpy":
self.children = {
"content": io.BytesIO(load_context.src.read(state["file"]))
Expand Down
22 changes: 12 additions & 10 deletions skops/io/_trusted_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import scipy
from sklearn.utils import all_estimators

from ._utils import get_type_name
from ._utils import get_public_type_names, get_type_name

PRIMITIVES_TYPES = [int, float, str, bool]

Expand All @@ -14,13 +14,15 @@
if get_type_name(estimator_class).startswith("sklearn.")
]

SCIPY_UFUNC_TYPE_NAMES = sorted(
set(
[
get_type_name(getattr(scipy.special, attr))
for attr in dir(scipy.special)
if isinstance(getattr(scipy.special, attr), np.ufunc)
and get_type_name(getattr(scipy.special, attr)).startswith("scipy")
]
)
# Fully qualified names of all public scipy.special ufuncs, e.g.
# "scipy.special._ufuncs.erf". Trusted by default.
SCIPY_UFUNC_TYPE_NAMES = get_public_type_names(module=scipy.special, oftype=np.ufunc)

# Fully qualified names of all public numpy ufuncs, e.g.
# "numpy.core._multiarray_umath.absolute". Trusted by default.
NUMPY_UFUNC_TYPE_NAMES = get_public_type_names(module=np, oftype=np.ufunc)

# ``np.sctypes`` was removed in numpy 2.0; fall back to ``np.sctypeDict``,
# which maps dtype aliases to the same scalar classes, so importing this
# module keeps working on both numpy 1.x and 2.x.
try:
    _NUMPY_DTYPE_CLASSES = [
        dtype
        for dtypes in np.sctypes.values()  # type: ignore
        for dtype in dtypes
    ]
except AttributeError:  # numpy >= 2.0
    _NUMPY_DTYPE_CLASSES = list(set(np.sctypeDict.values()))

# Fully qualified names of numpy scalar dtype classes, e.g. "numpy.int64".
NUMPY_DTYPE_TYPE_NAMES = sorted(
    {
        type_name
        for dtype in _NUMPY_DTYPE_CLASSES
        if (type_name := get_type_name(dtype)).startswith("numpy")
    }
)
34 changes: 34 additions & 0 deletions skops/io/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import sys
from dataclasses import dataclass, field
from functools import singledispatch
from types import ModuleType
from typing import Any, Type
from zipfile import ZipFile

Expand Down Expand Up @@ -200,3 +201,36 @@ def get_type_paths(types: Any) -> list[str]:
types = [types]

return [get_type_name(t) if not isinstance(t, str) else t for t in types]


def get_public_type_names(module: ModuleType, oftype: Type) -> list[str]:
    """Get the sorted type names of public members of ``module``.

    Collects every attribute reachable via ``dir(module)`` whose class is
    (a subclass of) ``oftype`` and whose fully qualified type name starts
    with the root package name of ``module``.

    Parameters
    ----------
    module: ModuleType
        Module under which the public objects are defined.

    oftype: Type
        The type of the objects.

    Returns
    -------
    type_names_list: list of str
        The sorted list of type names, all as strings,
        e.g. ``["numpy.core._multiarray_umath.absolute"]``.
    """
    # Use the *first* component of the dotted module path as the prefix
    # filter. ``rpartition(".")`` returned "" for top-level modules such as
    # ``numpy`` (no dot in the name), which made the ``startswith`` check a
    # no-op there; ``partition`` yields the root package name in both cases
    # ("numpy" -> "numpy", "scipy.special" -> "scipy").
    module_name, _, _ = module.__name__.partition(".")

    return sorted(
        {
            type_name
            for attr in dir(module)
            if issubclass((obj := getattr(module, attr)).__class__, oftype)
            and (type_name := get_type_name(obj)).startswith(module_name)
        }
    )
22 changes: 0 additions & 22 deletions skops/io/tests/test_audit.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,8 @@
from contextlib import suppress
from zipfile import ZipFile

import numpy as np
import pytest
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler

from skops.io import dumps, get_untrusted_types
from skops.io._audit import Node, audit_tree, check_type, get_tree, temp_setattr
Expand Down Expand Up @@ -152,25 +149,6 @@ def __init__(self):
assert not hasattr(temp, "b")


def test_complex_pipeline_untrusted_set():
    # A pipeline whose FunctionTransformer references plain numpy functions
    # (np.sqrt / np.square) should report exactly those two function
    # references as untrusted types when audited.
    # fmt: off
    clf = Pipeline([
        ("features", FeatureUnion([
            ("scaler", StandardScaler()),
            ("sqrt", FunctionTransformer(
                func=np.sqrt,
                inverse_func=np.square,
            )),
        ])),
        ("clf", LogisticRegression(random_state=0, solver="liblinear")),
    ])
    # fmt: on

    # Only the last component of each dotted type name is compared here.
    untrusted = get_untrusted_types(data=dumps(clf))
    type_names = [x.split(".")[-1] for x in untrusted]
    assert type_names == ["sqrt", "square"]


def test_format_object_node():
estimator = LogisticRegression(random_state=0, solver="liblinear")
state = get_state(estimator, SaveContext(None))
Expand Down
4 changes: 0 additions & 4 deletions skops/io/tests/test_external.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,6 @@ def trusted(self):
"lightgbm.sklearn.LGBMClassifier",
"lightgbm.sklearn.LGBMRegressor",
"lightgbm.sklearn.LGBMRanker",
"numpy.int32",
"numpy.int64",
"sklearn.preprocessing._label.LabelEncoder",
]

Expand Down Expand Up @@ -329,8 +327,6 @@ def trusted(self):
# TODO: adjust once more types are trusted by default
return [
"builtins.bytes",
"numpy.float32",
"numpy.float64",
"catboost.core.CatBoostClassifier",
"catboost.core.CatBoostRegressor",
"catboost.core.CatBoostRanker",
Expand Down
32 changes: 28 additions & 4 deletions skops/io/tests/test_persist.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,13 @@
from skops.io import dump, dumps, get_untrusted_types, load, loads
from skops.io._audit import NODE_TYPE_MAPPING, get_tree
from skops.io._sklearn import UNSUPPORTED_TYPES
from skops.io._trusted_types import SCIPY_UFUNC_TYPE_NAMES, SKLEARN_ESTIMATOR_TYPE_NAMES
from skops.io._trusted_types import (
NUMPY_DTYPE_TYPE_NAMES,
NUMPY_UFUNC_TYPE_NAMES,
PRIMITIVE_TYPE_NAMES,
SCIPY_UFUNC_TYPE_NAMES,
SKLEARN_ESTIMATOR_TYPE_NAMES,
)
from skops.io._utils import LoadContext, SaveContext, _get_state, get_state, gettype
from skops.io.exceptions import UnsupportedTypeException, UntrustedTypesFoundException
from skops.io.tests._utils import assert_method_outputs_equal, assert_params_equal
Expand Down Expand Up @@ -225,11 +231,17 @@ def _tested_estimators(type_filter=None):


def _tested_ufuncs():
    """Yield every scipy and numpy ufunc that is trusted by default."""
    all_names = SCIPY_UFUNC_TYPE_NAMES + NUMPY_UFUNC_TYPE_NAMES
    for qualified_name in all_names:
        # Split "package.module.func" into its module path and function name,
        # then resolve the name back into the actual ufunc object.
        mod_path, _, func_name = qualified_name.rpartition(".")
        yield gettype(module_name=mod_path, cls_or_func=func_name)


def _tested_types():
    """Yield every primitive and numpy dtype class that is trusted by default."""
    for qualified_name in PRIMITIVE_TYPE_NAMES + NUMPY_DTYPE_TYPE_NAMES:
        # Resolve "package.module.TypeName" back into the actual class object.
        mod_path, _, cls_name = qualified_name.rpartition(".")
        yield gettype(module_name=mod_path, cls_or_func=cls_name)


def _unsupported_estimators(type_filter=None):
for name, Estimator in all_estimators(type_filter=type_filter):
if Estimator not in UNSUPPORTED_TYPES:
Expand Down Expand Up @@ -359,15 +371,27 @@ def test_can_persist_fitted(estimator):

assert not any(type_ in SKLEARN_ESTIMATOR_TYPE_NAMES for type_ in untrusted_types)
assert not any(type_ in SCIPY_UFUNC_TYPE_NAMES for type_ in untrusted_types)
assert not any(type_ in NUMPY_UFUNC_TYPE_NAMES for type_ in untrusted_types)
assert not any(type_ in NUMPY_DTYPE_TYPE_NAMES for type_ in untrusted_types)
assert_method_outputs_equal(estimator, loaded, X)


@pytest.mark.parametrize("ufunc", _tested_ufuncs(), ids=SCIPY_UFUNC_TYPE_NAMES)
@pytest.mark.parametrize(
    "ufunc", _tested_ufuncs(), ids=SCIPY_UFUNC_TYPE_NAMES + NUMPY_UFUNC_TYPE_NAMES
)
def test_can_trust_ufuncs(ufunc):
    # Every public scipy and numpy ufunc is trusted by default, so dumping a
    # bare ufunc must yield no untrusted types. (The stale "extend with numpy
    # ufuncs" TODO was removed: the parametrization above already covers them
    # via NUMPY_UFUNC_TYPE_NAMES.)
    dumped = dumps(ufunc)
    untrusted_types = get_untrusted_types(data=dumped)
    assert len(untrusted_types) == 0


@pytest.mark.parametrize(
    "type_", _tested_types(), ids=PRIMITIVE_TYPE_NAMES + NUMPY_DTYPE_TYPE_NAMES
)
def test_can_trust_types(type_):
    # Primitive builtins and numpy dtype classes are trusted by default, so
    # auditing a dump of the bare class must report nothing untrusted.
    untrusted_types = get_untrusted_types(data=dumps(type_))
    assert len(untrusted_types) == 0


@pytest.mark.parametrize(
Expand Down
13 changes: 3 additions & 10 deletions skops/io/tests/test_visualize.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,26 +101,19 @@ def sink(nodes_iter, *args, **kwargs):
nodes_self_unsafe = [node for node in nodes if not node.is_self_safe]
nodes_unsafe = [node for node in nodes if not node.is_safe]

# there are currently 2 unsafe nodes, a numpy int and the custom
# functions. The former might be considered safe in the future, in which
# case this test needs to be changed.
assert len(nodes_self_unsafe) == 2
assert nodes_self_unsafe[0].val == "numpy.int64"
assert nodes_self_unsafe[1].val == "test_visualize.unsafe_function"
assert len(nodes_self_unsafe) == 1
assert nodes_self_unsafe[0].val == "test_visualize.unsafe_function"

# it's not easy to test the number of indirectly unsafe nodes, because
# it will depend on the nesting structure; we can only be sure that it's
# more than 2, and one of them should be the FunctionTransformer
assert len(nodes_unsafe) > 2
assert any("FunctionTransformer" in node.val for node in nodes_unsafe)

@pytest.mark.parametrize(
"trusted", [True, ["numpy.int64", "test_visualize.unsafe_function"]]
)
@pytest.mark.parametrize("trusted", [True, ["test_visualize.unsafe_function"]])
def test_all_nodes_trusted(self, pipeline, trusted, capsys):
# The pipeline contains untrusted type(s), but if we pass trusted=True,
# it is not considered untrusted anymore
# TODO: remove numpy.int64 from trusted once it's trusted by default
file = sio.dumps(pipeline)
sio.visualize(file, show="untrusted", trusted=trusted)
expected = "root: sklearn.pipeline.Pipeline"
Expand Down