Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/build-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ jobs:
- name: Tests
env:
SUPER_SECRET: ${{ secrets.HF_HUB_TOKEN }}
PYTHONIOENCODING: "utf-8"
run: |
python -m pytest -s -v --cov-report=xml -m "not inference" skops/

Expand Down
2 changes: 2 additions & 0 deletions docs/changes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ v0.7
- `compression` and `compresslevel` from :class:`~zipfile.ZipFile` are now
exposed to the user via :func:`.io.dumps` and :func:`.io.dump`. :pr:`345` by
`Adrin Jalali`_.
- Fix: :func:`skops.io.visualize` is now capable of showing bytes. :pr:`352` by
`Benjamin Bossan`_.

v0.6
----
Expand Down
12 changes: 12 additions & 0 deletions skops/io/_general.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import operator
import uuid
from functools import partial
from reprlib import Repr
from types import FunctionType, MethodType
from typing import Any, Sequence

Expand All @@ -27,6 +28,9 @@
)
from .exceptions import UnsupportedTypeException

arepr = Repr()
arepr.maxstring = 24


def dict_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]:
res = {
Expand Down Expand Up @@ -527,6 +531,11 @@ def _construct(self):
content = self.children["content"].getvalue()
return content

def format(self):
content = self.children["content"].getvalue()
byte_repr = arepr.repr(content)
return byte_repr


class BytearrayNode(BytesNode):
def __init__(
Expand All @@ -543,6 +552,9 @@ def _construct(self):
content_bytearray = bytearray(list(content_bytes))
return content_bytearray

def format(self):
return f"bytearray({super().format()})"


def operator_func_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]:
_, attrs = obj.__reduce__()
Expand Down
14 changes: 12 additions & 2 deletions skops/io/_visualize.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,21 @@
from zipfile import ZipFile

from ._audit import VALID_NODE_CHILD_TYPES, Node, get_tree
from ._general import FunctionNode, JsonNode, ListNode
from ._general import BytearrayNode, BytesNode, FunctionNode, JsonNode, ListNode
from ._numpy import NdArrayNode
from ._scipy import SparseMatrixNode
from ._utils import LoadContext

# The children of these types are not visualized
SKIPPED_TYPES = (
BytearrayNode,
BytesNode,
Comment thread
adrinjalali marked this conversation as resolved.
FunctionNode,
JsonNode,
NdArrayNode,
SparseMatrixNode,
)


@dataclass
class NodeInfo:
Expand Down Expand Up @@ -269,7 +279,7 @@ def walk_tree(
# TODO: For better security, we should check the schema if we return early,
# otherwise something nefarious could be hidden inside (however, if there
# is, the node should be marked as unsafe)
if isinstance(node, (NdArrayNode, SparseMatrixNode, FunctionNode, JsonNode)):
if isinstance(node, SKIPPED_TYPES):
return

yield from walk_tree(
Expand Down
91 changes: 79 additions & 12 deletions skops/io/tests/test_external.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,22 @@

Packages that are not builtins, standard lib, numpy, scipy, or scikit-learn.

Testing:

- persistence of unfitted models
- persistence of fitted models
- visualization of dumped models

with a range of hyperparameters.

"""

from unittest.mock import Mock, patch

import pytest
from sklearn.datasets import make_classification, make_regression

from skops.io import dumps, loads
from skops.io import dumps, loads, visualize
from skops.io.tests._utils import assert_method_outputs_equal, assert_params_equal

# Default settings for generated data
Expand Down Expand Up @@ -49,6 +59,14 @@ def rank_data(clf_data):
class TestLightGBM:
"""Tests for LGBMClassifier, LGBMRegressor, LGBMRanker"""

@pytest.fixture(autouse=True)
def capture_stdout(self):
# Mock print and rich.print so that running these tests with pytest -s
# does not spam stdout. Other, more common methods of suppressing
# printing to stdout don't seem to work, perhaps because of pytest.
with patch("builtins.print", Mock()), patch("rich.print", Mock()):
yield

@pytest.fixture(autouse=True)
def lgbm(self):
lgbm = pytest.importorskip("lightgbm")
Expand Down Expand Up @@ -83,9 +101,12 @@ def test_classifier(self, lgbm, clf_data, trusted, boosting_type):

X, y = clf_data
estimator.fit(X, y)
loaded = loads(dumps(estimator), trusted=trusted)
dumped = dumps(estimator)
loaded = loads(dumped, trusted=trusted)
assert_method_outputs_equal(estimator, loaded, X)

visualize(dumped, trusted=trusted)

@pytest.mark.parametrize("boosting_type", boosting_types)
def test_regressor(self, lgbm, regr_data, trusted, boosting_type):
kw = {}
Expand All @@ -99,9 +120,12 @@ def test_regressor(self, lgbm, regr_data, trusted, boosting_type):

X, y = regr_data
estimator.fit(X, y)
loaded = loads(dumps(estimator), trusted=trusted)
dumped = dumps(estimator)
loaded = loads(dumped, trusted=trusted)
assert_method_outputs_equal(estimator, loaded, X)

visualize(dumped, trusted=trusted)

@pytest.mark.parametrize("boosting_type", boosting_types)
def test_ranker(self, lgbm, rank_data, trusted, boosting_type):
kw = {}
Expand All @@ -115,9 +139,12 @@ def test_ranker(self, lgbm, rank_data, trusted, boosting_type):

X, y, group = rank_data
estimator.fit(X, y, group=group)
loaded = loads(dumps(estimator), trusted=trusted)
dumped = dumps(estimator)
loaded = loads(dumped, trusted=trusted)
assert_method_outputs_equal(estimator, loaded, X)

visualize(dumped, trusted=trusted)


class TestXGBoost:
"""Tests for XGBClassifier, XGBRegressor, XGBRFClassifier, XGBRFRegressor, XGBRanker
Expand All @@ -136,6 +163,14 @@ class TestXGBoost:

"""

@pytest.fixture(autouse=True)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TIL

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And TIL about the redirect_stdout context manager but it doesn't work here :D (I guess because of pytest)

def capture_stdout(self):
# Mock print and rich.print so that running these tests with pytest -s
# does not spam stdout. Other, more common methods of suppressing
# printing to stdout don't seem to work, perhaps because of pytest.
with patch("builtins.print", Mock()), patch("rich.print", Mock()):
yield

@pytest.fixture(autouse=True)
def xgboost(self):
xgboost = pytest.importorskip("xgboost")
Expand Down Expand Up @@ -170,9 +205,12 @@ def test_classifier(self, xgboost, clf_data, trusted, booster, tree_method):

X, y = clf_data
estimator.fit(X, y)
loaded = loads(dumps(estimator), trusted=trusted)
dumped = dumps(estimator)
loaded = loads(dumped, trusted=trusted)
assert_method_outputs_equal(estimator, loaded, X)

visualize(dumped, trusted=trusted)

@pytest.mark.parametrize("booster", boosters)
@pytest.mark.parametrize("tree_method", tree_methods)
def test_regressor(self, xgboost, regr_data, trusted, booster, tree_method):
Expand All @@ -186,9 +224,12 @@ def test_regressor(self, xgboost, regr_data, trusted, booster, tree_method):

X, y = regr_data
estimator.fit(X, y)
loaded = loads(dumps(estimator), trusted=trusted)
dumped = dumps(estimator)
loaded = loads(dumped, trusted=trusted)
assert_method_outputs_equal(estimator, loaded, X)

visualize(dumped, trusted=trusted)

@pytest.mark.parametrize("booster", boosters)
@pytest.mark.parametrize("tree_method", tree_methods)
def test_rf_classifier(self, xgboost, clf_data, trusted, booster, tree_method):
Expand All @@ -202,9 +243,12 @@ def test_rf_classifier(self, xgboost, clf_data, trusted, booster, tree_method):

X, y = clf_data
estimator.fit(X, y)
loaded = loads(dumps(estimator), trusted=trusted)
dumped = dumps(estimator)
loaded = loads(dumped, trusted=trusted)
assert_method_outputs_equal(estimator, loaded, X)

visualize(dumped, trusted=trusted)

@pytest.mark.parametrize("booster", boosters)
@pytest.mark.parametrize("tree_method", tree_methods)
def test_rf_regressor(self, xgboost, regr_data, trusted, booster, tree_method):
Expand All @@ -218,9 +262,12 @@ def test_rf_regressor(self, xgboost, regr_data, trusted, booster, tree_method):

X, y = regr_data
estimator.fit(X, y)
loaded = loads(dumps(estimator), trusted=trusted)
dumped = dumps(estimator)
loaded = loads(dumped, trusted=trusted)
assert_method_outputs_equal(estimator, loaded, X)

visualize(dumped, trusted=trusted)

@pytest.mark.parametrize("booster", boosters)
@pytest.mark.parametrize("tree_method", tree_methods)
def test_ranker(self, xgboost, rank_data, trusted, booster, tree_method):
Expand All @@ -234,13 +281,24 @@ def test_ranker(self, xgboost, rank_data, trusted, booster, tree_method):

X, y, group = rank_data
estimator.fit(X, y, group=group)
loaded = loads(dumps(estimator), trusted=trusted)
dumped = dumps(estimator)
loaded = loads(dumped, trusted=trusted)
assert_method_outputs_equal(estimator, loaded, X)

visualize(dumped, trusted=trusted)


class TestCatboost:
"""Tests for CatBoostClassifier, CatBoostRegressor, and CatBoostRanker"""

@pytest.fixture(autouse=True)
def capture_stdout(self):
# Mock print and rich.print so that running these tests with pytest -s
# does not spam stdout. Other, more common methods of suppressing
# printing to stdout don't seem to work, perhaps because of pytest.
with patch("builtins.print", Mock()), patch("rich.print", Mock()):
yield

# CatBoost data is a little different so that it works as categorical data
@pytest.fixture(scope="module")
def cb_clf_data(self, clf_data):
Expand Down Expand Up @@ -290,9 +348,12 @@ def test_classifier(self, catboost, cb_clf_data, trusted, boosting_type):

X, y = cb_clf_data
estimator.fit(X, y, cat_features=[0, 1])
loaded = loads(dumps(estimator), trusted=trusted)
dumped = dumps(estimator)
loaded = loads(dumped, trusted=trusted)
assert_method_outputs_equal(estimator, loaded, X)

visualize(dumped, trusted=trusted)

@pytest.mark.parametrize("boosting_type", boosting_types)
def test_regressor(self, catboost, cb_regr_data, trusted, boosting_type):
estimator = catboost.CatBoostRegressor(
Expand All @@ -303,9 +364,12 @@ def test_regressor(self, catboost, cb_regr_data, trusted, boosting_type):

X, y = cb_regr_data
estimator.fit(X, y, cat_features=[0, 1])
loaded = loads(dumps(estimator), trusted=trusted)
dumped = dumps(estimator)
loaded = loads(dumped, trusted=trusted)
assert_method_outputs_equal(estimator, loaded, X)

visualize(dumped, trusted=trusted)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these are going to print a bunch of stuff in the terminal when we run tests, shouldn't we be capturing the output? It also makes me realize, maybe we want to have an output arg for visualize?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these are going to print a bunch of stuff in the terminal when we run tests

pytest automatically captures stdout unless used with the -s option, no?

maybe we want to have an output arg for visualize?

What would that argument do? We have the sink argument, in case someone wants to push the output to a logger, for instance.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sink is not trivial to implement, it'd be nice if one could pass something like os.devnull and the output would be forwarded there.

I know by default pytest captures output, but even when we pass -s right now, there's almost no output, and this adds quite a bit there. If you think it's not worth fixing, I'm happy to merge as is.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sink is not trivial to implement, it'd be nice if one could pass something like os.devnull and the output would be forwarded there.

You should still be able to change sys.stdout, right?

I know by default pytest captures output, but even when we pass -s right now, there's almost no output, and this adds quite a bit there.

I see. For test_external.py, I made a change to not print anything.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CI is green, please review again


@pytest.mark.parametrize("boosting_type", boosting_types)
def test_ranker(self, catboost, cb_rank_data, trusted, boosting_type):
estimator = catboost.CatBoostRanker(
Expand All @@ -316,5 +380,8 @@ def test_ranker(self, catboost, cb_rank_data, trusted, boosting_type):

X, y, group_id = cb_rank_data
estimator.fit(X, y, cat_features=[0, 1], group_id=group_id)
loaded = loads(dumps(estimator), trusted=trusted)
dumped = dumps(estimator)
loaded = loads(dumped, trusted=trusted)
assert_method_outputs_equal(estimator, loaded, X)

visualize(dumped, trusted=trusted)
20 changes: 20 additions & 0 deletions skops/io/tests/test_visualize.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,3 +269,23 @@ def test_from_file(self, simple, tmp_path, capsys):
]
stdout, _ = capsys.readouterr()
assert stdout.strip() == "\n".join(expected)

def test_long_bytes(self, capsys):
obj = {
"short_byte": b"abc",
"long_byte": b"010203040506070809101112131415",
"short_bytearray": bytearray(b"abc"),
"long_bytearray": bytearray(b"010203040506070809101112131415"),
}
dumped = sio.dumps(obj)
sio.visualize(dumped)

expected = [
"root: builtins.dict",
"├── short_byte: b'abc'",
"├── long_byte: b'01020304050...9101112131415'",
"├── short_bytearray: bytearray(b'abc')",
"└── long_bytearray: bytearray(b'01020304050...9101112131415')",
]
stdout, _ = capsys.readouterr()
assert stdout.strip() == "\n".join(expected)