skops-dev · adrinjalali · Apr 27, 2023 · Apr 24, 2023 · Apr 24, 2023 · Apr 25, 2023
diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml
@@ -79,6 +79,7 @@ jobs:
     - name: Tests
       env:
         SUPER_SECRET: ${{ secrets.HF_HUB_TOKEN }}
+        PYTHONIOENCODING: "utf-8"
       run: |
         python -m pytest -s -v --cov-report=xml -m "not inference" skops/
 

diff --git a/docs/changes.rst b/docs/changes.rst
@@ -15,6 +15,8 @@ v0.7
 - `compression` and `compresslevel` from :class:`~zipfile.ZipFile` are now
   exposed to the user via :func:`.io.dumps` and :func:`.io.dump`. :pr:`345` by
   `Adrin Jalali`_.
+- Fix: :func:`skops.io.visualize` can now display ``bytes`` and ``bytearray``
+  objects, abbreviating long values. :pr:`352` by `Benjamin Bossan`_.
 
 v0.6
 ----

diff --git a/skops/io/_general.py b/skops/io/_general.py
@@ -5,6 +5,7 @@
 import operator
 import uuid
 from functools import partial
+from reprlib import Repr
 from types import FunctionType, MethodType
 from typing import Any, Sequence
 
@@ -27,6 +28,9 @@
 )
 from .exceptions import UnsupportedTypeException
 
+arepr = Repr()  # shared abbreviating repr used by the byte node format() methods
+arepr.maxother = 30  # bytes hit Repr.repr_instance: capped by maxother, not maxstring
+
 
 def dict_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]:
     res = {
@@ -527,6 +531,11 @@ def _construct(self):
         content = self.children["content"].getvalue()
         return content
 
+    def format(self):
+        # Render the stored bytes via the shared arepr, which abbreviates long values.
+        raw = self.children["content"].getvalue()
+        return arepr.repr(raw)
+
 
 class BytearrayNode(BytesNode):
     def __init__(
@@ -543,6 +552,9 @@ def _construct(self):
         content_bytearray = bytearray(list(content_bytes))
         return content_bytearray
 
+    def format(self):
+        return "bytearray(" + super().format() + ")"  # wrap the parent's bytes repr
+
 
 def operator_func_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]:
     _, attrs = obj.__reduce__()

diff --git a/skops/io/_visualize.py b/skops/io/_visualize.py
@@ -8,11 +8,21 @@
 from zipfile import ZipFile
 
 from ._audit import VALID_NODE_CHILD_TYPES, Node, get_tree
-from ._general import FunctionNode, JsonNode, ListNode
+from ._general import BytearrayNode, BytesNode, FunctionNode, JsonNode, ListNode
 from ._numpy import NdArrayNode
 from ._scipy import SparseMatrixNode
 from ._utils import LoadContext
 
+# Node types treated as leaves: walk_tree returns early and skips their children.
+SKIPPED_TYPES = (
+    BytearrayNode,
+    BytesNode,
+    FunctionNode,
+    JsonNode,
+    NdArrayNode,
+    SparseMatrixNode,
+)
+
 
 @dataclass
 class NodeInfo:
@@ -269,7 +279,7 @@ def walk_tree(
     # TODO: For better security, we should check the schema if we return early,
     # otherwise something nefarious could be hidden inside (however, if there
     # is, the node should be marked as unsafe)
-    if isinstance(node, (NdArrayNode, SparseMatrixNode, FunctionNode, JsonNode)):
+    if isinstance(node, SKIPPED_TYPES):
         return
 
     yield from walk_tree(

diff --git a/skops/io/tests/test_external.py b/skops/io/tests/test_external.py
@@ -2,12 +2,22 @@
 
 Packages that are not builtins, standard lib, numpy, scipy, or scikit-learn.
 
+Testing:
+
+- persistence of unfitted models
+- persistence of fitted models
+- visualization of dumped models
+
+with a range of hyperparameters.
+
 """
 
+from unittest.mock import Mock, patch
+
 import pytest
 from sklearn.datasets import make_classification, make_regression
 
-from skops.io import dumps, loads
+from skops.io import dumps, loads, visualize
 from skops.io.tests._utils import assert_method_outputs_equal, assert_params_equal
 
 # Default settings for generated data
@@ -49,6 +59,14 @@ def rank_data(clf_data):
 class TestLightGBM:
     """Tests for LGBMClassifier, LGBMRegressor, LGBMRanker"""
 
+    @pytest.fixture(autouse=True)
+    def capture_stdout(self):
+        # Replace builtins.print and rich.print with mocks for every test in this
+        # class, so visualize() output does not flood the log under `pytest -s`.
+        # (Per the original author, the usual stdout-suppression tools did not work.)
+        with patch("builtins.print", Mock()), patch("rich.print", Mock()):
+            yield
+
     @pytest.fixture(autouse=True)
     def lgbm(self):
         lgbm = pytest.importorskip("lightgbm")
@@ -83,9 +101,12 @@ def test_classifier(self, lgbm, clf_data, trusted, boosting_type):
 
         X, y = clf_data
         estimator.fit(X, y)
-        loaded = loads(dumps(estimator), trusted=trusted)
+        dumped = dumps(estimator)
+        loaded = loads(dumped, trusted=trusted)
         assert_method_outputs_equal(estimator, loaded, X)
 
+        visualize(dumped, trusted=trusted)
+
     @pytest.mark.parametrize("boosting_type", boosting_types)
     def test_regressor(self, lgbm, regr_data, trusted, boosting_type):
         kw = {}
@@ -99,9 +120,12 @@ def test_regressor(self, lgbm, regr_data, trusted, boosting_type):
 
         X, y = regr_data
         estimator.fit(X, y)
-        loaded = loads(dumps(estimator), trusted=trusted)
+        dumped = dumps(estimator)
+        loaded = loads(dumped, trusted=trusted)
         assert_method_outputs_equal(estimator, loaded, X)
 
+        visualize(dumped, trusted=trusted)
+
     @pytest.mark.parametrize("boosting_type", boosting_types)
     def test_ranker(self, lgbm, rank_data, trusted, boosting_type):
         kw = {}
@@ -115,9 +139,12 @@ def test_ranker(self, lgbm, rank_data, trusted, boosting_type):
 
         X, y, group = rank_data
         estimator.fit(X, y, group=group)
-        loaded = loads(dumps(estimator), trusted=trusted)
+        dumped = dumps(estimator)
+        loaded = loads(dumped, trusted=trusted)
         assert_method_outputs_equal(estimator, loaded, X)
 
+        visualize(dumped, trusted=trusted)
+
 
 class TestXGBoost:
     """Tests for XGBClassifier, XGBRegressor, XGBRFClassifier, XGBRFRegressor, XGBRanker
@@ -136,6 +163,14 @@ class TestXGBoost:
 
     """
 
+    @pytest.fixture(autouse=True)
+    def capture_stdout(self):
+        # Replace builtins.print and rich.print with mocks for every test in this
+        # class, so visualize() output does not flood the log under `pytest -s`.
+        # (Per the original author, the usual stdout-suppression tools did not work.)
+        with patch("builtins.print", Mock()), patch("rich.print", Mock()):
+            yield
+
     @pytest.fixture(autouse=True)
     def xgboost(self):
         xgboost = pytest.importorskip("xgboost")
@@ -170,9 +205,12 @@ def test_classifier(self, xgboost, clf_data, trusted, booster, tree_method):
 
         X, y = clf_data
         estimator.fit(X, y)
-        loaded = loads(dumps(estimator), trusted=trusted)
+        dumped = dumps(estimator)
+        loaded = loads(dumped, trusted=trusted)
         assert_method_outputs_equal(estimator, loaded, X)
 
+        visualize(dumped, trusted=trusted)
+
     @pytest.mark.parametrize("booster", boosters)
     @pytest.mark.parametrize("tree_method", tree_methods)
     def test_regressor(self, xgboost, regr_data, trusted, booster, tree_method):
@@ -186,9 +224,12 @@ def test_regressor(self, xgboost, regr_data, trusted, booster, tree_method):
 
         X, y = regr_data
         estimator.fit(X, y)
-        loaded = loads(dumps(estimator), trusted=trusted)
+        dumped = dumps(estimator)
+        loaded = loads(dumped, trusted=trusted)
         assert_method_outputs_equal(estimator, loaded, X)
 
+        visualize(dumped, trusted=trusted)
+
     @pytest.mark.parametrize("booster", boosters)
     @pytest.mark.parametrize("tree_method", tree_methods)
     def test_rf_classifier(self, xgboost, clf_data, trusted, booster, tree_method):
@@ -202,9 +243,12 @@ def test_rf_classifier(self, xgboost, clf_data, trusted, booster, tree_method):
 
         X, y = clf_data
         estimator.fit(X, y)
-        loaded = loads(dumps(estimator), trusted=trusted)
+        dumped = dumps(estimator)
+        loaded = loads(dumped, trusted=trusted)
         assert_method_outputs_equal(estimator, loaded, X)
 
+        visualize(dumped, trusted=trusted)
+
     @pytest.mark.parametrize("booster", boosters)
     @pytest.mark.parametrize("tree_method", tree_methods)
     def test_rf_regressor(self, xgboost, regr_data, trusted, booster, tree_method):
@@ -218,9 +262,12 @@ def test_rf_regressor(self, xgboost, regr_data, trusted, booster, tree_method):
 
         X, y = regr_data
         estimator.fit(X, y)
-        loaded = loads(dumps(estimator), trusted=trusted)
+        dumped = dumps(estimator)
+        loaded = loads(dumped, trusted=trusted)
         assert_method_outputs_equal(estimator, loaded, X)
 
+        visualize(dumped, trusted=trusted)
+
     @pytest.mark.parametrize("booster", boosters)
     @pytest.mark.parametrize("tree_method", tree_methods)
     def test_ranker(self, xgboost, rank_data, trusted, booster, tree_method):
@@ -234,13 +281,24 @@ def test_ranker(self, xgboost, rank_data, trusted, booster, tree_method):
 
         X, y, group = rank_data
         estimator.fit(X, y, group=group)
-        loaded = loads(dumps(estimator), trusted=trusted)
+        dumped = dumps(estimator)
+        loaded = loads(dumped, trusted=trusted)
         assert_method_outputs_equal(estimator, loaded, X)
 
+        visualize(dumped, trusted=trusted)
+
 
 class TestCatboost:
     """Tests for CatBoostClassifier, CatBoostRegressor, and CatBoostRanker"""
 
+    @pytest.fixture(autouse=True)
+    def capture_stdout(self):
+        # Replace builtins.print and rich.print with mocks for every test in this
+        # class, so visualize() output does not flood the log under `pytest -s`.
+        # (Per the original author, the usual stdout-suppression tools did not work.)
+        with patch("builtins.print", Mock()), patch("rich.print", Mock()):
+            yield
+
     # CatBoost data is a little different so that it works as categorical data
     @pytest.fixture(scope="module")
     def cb_clf_data(self, clf_data):
@@ -290,9 +348,12 @@ def test_classifier(self, catboost, cb_clf_data, trusted, boosting_type):
 
         X, y = cb_clf_data
         estimator.fit(X, y, cat_features=[0, 1])
-        loaded = loads(dumps(estimator), trusted=trusted)
+        dumped = dumps(estimator)
+        loaded = loads(dumped, trusted=trusted)
         assert_method_outputs_equal(estimator, loaded, X)
 
+        visualize(dumped, trusted=trusted)
+
     @pytest.mark.parametrize("boosting_type", boosting_types)
     def test_regressor(self, catboost, cb_regr_data, trusted, boosting_type):
         estimator = catboost.CatBoostRegressor(
@@ -303,9 +364,12 @@ def test_regressor(self, catboost, cb_regr_data, trusted, boosting_type):
 
         X, y = cb_regr_data
         estimator.fit(X, y, cat_features=[0, 1])
-        loaded = loads(dumps(estimator), trusted=trusted)
+        dumped = dumps(estimator)
+        loaded = loads(dumped, trusted=trusted)
         assert_method_outputs_equal(estimator, loaded, X)
 
+        visualize(dumped, trusted=trusted)
+
     @pytest.mark.parametrize("boosting_type", boosting_types)
     def test_ranker(self, catboost, cb_rank_data, trusted, boosting_type):
         estimator = catboost.CatBoostRanker(
@@ -316,5 +380,8 @@ def test_ranker(self, catboost, cb_rank_data, trusted, boosting_type):
 
         X, y, group_id = cb_rank_data
         estimator.fit(X, y, cat_features=[0, 1], group_id=group_id)
-        loaded = loads(dumps(estimator), trusted=trusted)
+        dumped = dumps(estimator)
+        loaded = loads(dumped, trusted=trusted)
         assert_method_outputs_equal(estimator, loaded, X)
+
+        visualize(dumped, trusted=trusted)
diff --git a/skops/io/tests/test_visualize.py b/skops/io/tests/test_visualize.py
@@ -269,3 +269,23 @@ def test_from_file(self, simple, tmp_path, capsys):
         ]
         stdout, _ = capsys.readouterr()
         assert stdout.strip() == "\n".join(expected)
+
+    def test_long_bytes(self, capsys):
+        long_content = b"010203040506070809101112131415"
+        obj = {
+            "short_byte": b"abc",
+            "long_byte": long_content,
+            "short_bytearray": bytearray(b"abc"),
+            "long_bytearray": bytearray(long_content),
+        }
+        sio.visualize(sio.dumps(obj))
+
+        expected_lines = [
+            "root: builtins.dict",
+            "├── short_byte: b'abc'",
+            "├── long_byte: b'01020304050...9101112131415'",
+            "├── short_bytearray: bytearray(b'abc')",
+            "└── long_bytearray: bytearray(b'01020304050...9101112131415')",
+        ]
+        # visualize() prints the tree; compare captured stdout with the expected tree.
+        assert capsys.readouterr().out.strip() == "\n".join(expected_lines)