diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index 1dfa0e30..abb9438b 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -79,6 +79,7 @@ jobs: - name: Tests env: SUPER_SECRET: ${{ secrets.HF_HUB_TOKEN }} + PYTHONIOENCODING: "utf-8" run: | python -m pytest -s -v --cov-report=xml -m "not inference" skops/ diff --git a/docs/changes.rst b/docs/changes.rst index 295b876b..f70dcaaf 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -15,6 +15,8 @@ v0.7 - `compression` and `compresslevel` from :class:`~zipfile.ZipFile` are now exposed to the user via :func:`.io.dumps` and :func:`.io.dump`. :pr:`345` by `Adrin Jalali`_. +- Fix: :func:`skops.io.visualize` is now capable of showing bytes. :pr:`352` by + `Benjamin Bossan`_. v0.6 ---- diff --git a/skops/io/_general.py b/skops/io/_general.py index b6b00d11..0902857c 100644 --- a/skops/io/_general.py +++ b/skops/io/_general.py @@ -5,6 +5,7 @@ import operator import uuid from functools import partial +from reprlib import Repr from types import FunctionType, MethodType from typing import Any, Sequence @@ -27,6 +28,9 @@ ) from .exceptions import UnsupportedTypeException +arepr = Repr() +arepr.maxstring = 24 + def dict_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: res = { @@ -527,6 +531,11 @@ def _construct(self): content = self.children["content"].getvalue() return content + def format(self): + content = self.children["content"].getvalue() + byte_repr = arepr.repr(content) + return byte_repr + class BytearrayNode(BytesNode): def __init__( @@ -543,6 +552,9 @@ def _construct(self): content_bytearray = bytearray(list(content_bytes)) return content_bytearray + def format(self): + return f"bytearray({super().format()})" + def operator_func_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: _, attrs = obj.__reduce__() diff --git a/skops/io/_visualize.py b/skops/io/_visualize.py index 1aa3dca2..9df80928 100644 --- a/skops/io/_visualize.py +++ b/skops/io/_visualize.py @@ -8,11 +8,21 @@ from zipfile import ZipFile from ._audit import VALID_NODE_CHILD_TYPES, Node, get_tree -from ._general import FunctionNode, JsonNode, ListNode +from ._general import BytearrayNode, BytesNode, FunctionNode, JsonNode, ListNode from ._numpy import NdArrayNode from ._scipy import SparseMatrixNode from ._utils import LoadContext +# The children of these types are not visualized +SKIPPED_TYPES = ( + BytearrayNode, + BytesNode, + FunctionNode, + JsonNode, + NdArrayNode, + SparseMatrixNode, +) + @dataclass class NodeInfo: @@ -269,7 +279,7 @@ def walk_tree( # TODO: For better security, we should check the schema if we return early, # otherwise something nefarious could be hidden inside (however, if there # is, the node should be marked as unsafe) - if isinstance(node, (NdArrayNode, SparseMatrixNode, FunctionNode, JsonNode)): + if isinstance(node, SKIPPED_TYPES): return yield from walk_tree( diff --git a/skops/io/tests/test_external.py b/skops/io/tests/test_external.py index a8e6038f..b41253c0 100644 --- a/skops/io/tests/test_external.py +++ b/skops/io/tests/test_external.py @@ -2,12 +2,22 @@ Packages that are not builtins, standard lib, numpy, scipy, or scikit-learn. +Testing: + +- persistence of unfitted models +- persistence of fitted models +- visualization of dumped models + +with a range of hyperparameters. + """ +from unittest.mock import Mock, patch + import pytest from sklearn.datasets import make_classification, make_regression -from skops.io import dumps, loads +from skops.io import dumps, loads, visualize from skops.io.tests._utils import assert_method_outputs_equal, assert_params_equal # Default settings for generated data @@ -49,6 +59,14 @@ def rank_data(clf_data): class TestLightGBM: """Tests for LGBMClassifier, LGBMRegressor, LGBMRanker""" + @pytest.fixture(autouse=True) + def capture_stdout(self): + # Mock print and rich.print so that running these tests with pytest -s + # does not spam stdout. Other, more common methods of suppressing + # printing to stdout don't seem to work, perhaps because of pytest. + with patch("builtins.print", Mock()), patch("rich.print", Mock()): + yield + @pytest.fixture(autouse=True) def lgbm(self): lgbm = pytest.importorskip("lightgbm") @@ -83,9 +101,12 @@ def test_classifier(self, lgbm, clf_data, trusted, boosting_type): X, y = clf_data estimator.fit(X, y) - loaded = loads(dumps(estimator), trusted=trusted) + dumped = dumps(estimator) + loaded = loads(dumped, trusted=trusted) assert_method_outputs_equal(estimator, loaded, X) + visualize(dumped, trusted=trusted) + @pytest.mark.parametrize("boosting_type", boosting_types) def test_regressor(self, lgbm, regr_data, trusted, boosting_type): kw = {} @@ -99,9 +120,12 @@ def test_regressor(self, lgbm, regr_data, trusted, boosting_type): X, y = regr_data estimator.fit(X, y) - loaded = loads(dumps(estimator), trusted=trusted) + dumped = dumps(estimator) + loaded = loads(dumped, trusted=trusted) assert_method_outputs_equal(estimator, loaded, X) + visualize(dumped, trusted=trusted) + @pytest.mark.parametrize("boosting_type", boosting_types) def test_ranker(self, lgbm, rank_data, trusted, boosting_type): kw = {} @@ -115,9 +139,12 @@ def test_ranker(self, lgbm, rank_data, trusted, boosting_type): X, y, group = rank_data estimator.fit(X, y, group=group) - loaded = loads(dumps(estimator), trusted=trusted) + dumped = dumps(estimator) + loaded = loads(dumped, trusted=trusted) assert_method_outputs_equal(estimator, loaded, X) + visualize(dumped, trusted=trusted) + class TestXGBoost: """Tests for XGBClassifier, XGBRegressor, XGBRFClassifier, XGBRFRegressor, XGBRanker @@ -136,6 +163,14 @@ class TestXGBoost: """ + @pytest.fixture(autouse=True) + def capture_stdout(self): + # Mock print and rich.print so that running these tests with pytest -s + # does not spam stdout. Other, more common methods of suppressing + # printing to stdout don't seem to work, perhaps because of pytest. + with patch("builtins.print", Mock()), patch("rich.print", Mock()): + yield + @pytest.fixture(autouse=True) def xgboost(self): xgboost = pytest.importorskip("xgboost") @@ -170,9 +205,12 @@ def test_classifier(self, xgboost, clf_data, trusted, booster, tree_method): X, y = clf_data estimator.fit(X, y) - loaded = loads(dumps(estimator), trusted=trusted) + dumped = dumps(estimator) + loaded = loads(dumped, trusted=trusted) assert_method_outputs_equal(estimator, loaded, X) + visualize(dumped, trusted=trusted) + @pytest.mark.parametrize("booster", boosters) @pytest.mark.parametrize("tree_method", tree_methods) def test_regressor(self, xgboost, regr_data, trusted, booster, tree_method): @@ -186,9 +224,12 @@ def test_regressor(self, xgboost, regr_data, trusted, booster, tree_method): X, y = regr_data estimator.fit(X, y) - loaded = loads(dumps(estimator), trusted=trusted) + dumped = dumps(estimator) + loaded = loads(dumped, trusted=trusted) assert_method_outputs_equal(estimator, loaded, X) + visualize(dumped, trusted=trusted) + @pytest.mark.parametrize("booster", boosters) @pytest.mark.parametrize("tree_method", tree_methods) def test_rf_classifier(self, xgboost, clf_data, trusted, booster, tree_method): @@ -202,9 +243,12 @@ def test_rf_classifier(self, xgboost, clf_data, trusted, booster, tree_method): X, y = clf_data estimator.fit(X, y) - loaded = loads(dumps(estimator), trusted=trusted) + dumped = dumps(estimator) + loaded = loads(dumped, trusted=trusted) assert_method_outputs_equal(estimator, loaded, X) + visualize(dumped, trusted=trusted) + @pytest.mark.parametrize("booster", boosters) @pytest.mark.parametrize("tree_method", tree_methods) def test_rf_regressor(self, xgboost, regr_data, trusted, booster, tree_method): @@ -218,9 +262,12 @@ def test_rf_regressor(self, xgboost, regr_data, trusted, booster, tree_method): X, y = regr_data estimator.fit(X, y) - loaded = loads(dumps(estimator), trusted=trusted) + dumped = dumps(estimator) + loaded = loads(dumped, trusted=trusted) assert_method_outputs_equal(estimator, loaded, X) + visualize(dumped, trusted=trusted) + @pytest.mark.parametrize("booster", boosters) @pytest.mark.parametrize("tree_method", tree_methods) def test_ranker(self, xgboost, rank_data, trusted, booster, tree_method): @@ -234,13 +281,24 @@ def test_ranker(self, xgboost, rank_data, trusted, booster, tree_method): X, y, group = rank_data estimator.fit(X, y, group=group) - loaded = loads(dumps(estimator), trusted=trusted) + dumped = dumps(estimator) + loaded = loads(dumped, trusted=trusted) assert_method_outputs_equal(estimator, loaded, X) + visualize(dumped, trusted=trusted) + class TestCatboost: """Tests for CatBoostClassifier, CatBoostRegressor, and CatBoostRanker""" + @pytest.fixture(autouse=True) + def capture_stdout(self): + # Mock print and rich.print so that running these tests with pytest -s + # does not spam stdout. Other, more common methods of suppressing + # printing to stdout don't seem to work, perhaps because of pytest. + with patch("builtins.print", Mock()), patch("rich.print", Mock()): + yield + # CatBoost data is a little different so that it works as categorical data @pytest.fixture(scope="module") def cb_clf_data(self, clf_data): @@ -290,9 +348,12 @@ def test_classifier(self, catboost, cb_clf_data, trusted, boosting_type): X, y = cb_clf_data estimator.fit(X, y, cat_features=[0, 1]) - loaded = loads(dumps(estimator), trusted=trusted) + dumped = dumps(estimator) + loaded = loads(dumped, trusted=trusted) assert_method_outputs_equal(estimator, loaded, X) + visualize(dumped, trusted=trusted) + @pytest.mark.parametrize("boosting_type", boosting_types) def test_regressor(self, catboost, cb_regr_data, trusted, boosting_type): estimator = catboost.CatBoostRegressor( @@ -303,9 +364,12 @@ def test_regressor(self, catboost, cb_regr_data, trusted, boosting_type): X, y = cb_regr_data estimator.fit(X, y, cat_features=[0, 1]) - loaded = loads(dumps(estimator), trusted=trusted) + dumped = dumps(estimator) + loaded = loads(dumped, trusted=trusted) assert_method_outputs_equal(estimator, loaded, X) + visualize(dumped, trusted=trusted) + @pytest.mark.parametrize("boosting_type", boosting_types) def test_ranker(self, catboost, cb_rank_data, trusted, boosting_type): estimator = catboost.CatBoostRanker( @@ -316,5 +380,8 @@ def test_ranker(self, catboost, cb_rank_data, trusted, boosting_type): X, y, group_id = cb_rank_data estimator.fit(X, y, cat_features=[0, 1], group_id=group_id) - loaded = loads(dumps(estimator), trusted=trusted) + dumped = dumps(estimator) + loaded = loads(dumped, trusted=trusted) assert_method_outputs_equal(estimator, loaded, X) + + visualize(dumped, trusted=trusted) diff --git a/skops/io/tests/test_visualize.py b/skops/io/tests/test_visualize.py index 33a00436..77d2fd7d 100644 --- a/skops/io/tests/test_visualize.py +++ b/skops/io/tests/test_visualize.py @@ -269,3 +269,23 @@ def test_from_file(self, simple, tmp_path, capsys): ] stdout, _ = capsys.readouterr() assert stdout.strip() == "\n".join(expected) + + def test_long_bytes(self, capsys): + obj = { + "short_byte": b"abc", + "long_byte": b"010203040506070809101112131415", + "short_bytearray": bytearray(b"abc"), + "long_bytearray": bytearray(b"010203040506070809101112131415"), + } + dumped = sio.dumps(obj) + sio.visualize(dumped) + + expected = [ + "root: builtins.dict", + "├── short_byte: b'abc'", + "├── long_byte: b'01020304050...9101112131415'", + "├── short_bytearray: bytearray(b'abc')", + "└── long_bytearray: bytearray(b'01020304050...9101112131415')", + ] + stdout, _ = capsys.readouterr() + assert stdout.strip() == "\n".join(expected)