From 6654d9c0e5b89c71266fd4e64a382d0a0f2d0a45 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Thu, 8 Dec 2022 16:52:14 +0100 Subject: [PATCH 01/12] Add support for LightGBM and XGBoost Partially solves #226 Description LightGBM already worked out of the box, so for this, only tests were added. For XGBoost, extra code was added to support it. Implementation Dependencies were extended to include xgboost and lightgbm for testing, so that they should be covered by CI. If those libraries are not installed, the tests should be gracefully skipped. For XGBoost, the xgb-specific persistence format was chosen, which can be used through estimator.save_model and estimator.load_model. I could not verify (yet) that this format is 100% secure. However, since it's a binary json file (.ubj), it should be as secure as json. More information here: https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html The actual implementation for xgboost mirrors the one for numpy arrays, which also stores its data in an external file. However, I did not apply any caching here, as it seems it wouldn't be useful. I decided to create a new test file, test_external.py, to put the new tests, instead of cramming more into test_persist.py. Since some functions from test_persist were required here, I moved them to a new _utils.py module. Interestingly, I found some potential bugs where some values returned by get_params may differ for xgboost models after loading. This is independent of skops. Those edge cases are documented but were excluded from tests, as it appears that they should be inconsequential. 
--- docs/persistence.rst | 18 ++ skops/_min_dependencies.py | 3 + skops/io/_persist.py | 2 +- skops/io/_xgboost.py | 88 ++++++++++ skops/io/tests/_utils.py | 145 ++++++++++++++++ skops/io/tests/test_external.py | 296 ++++++++++++++++++++++++++++++++ skops/io/tests/test_persist.py | 143 +-------------- 7 files changed, 553 insertions(+), 142 deletions(-) create mode 100644 skops/io/_xgboost.py create mode 100644 skops/io/tests/_utils.py create mode 100644 skops/io/tests/test_external.py diff --git a/docs/persistence.rst b/docs/persistence.rst index 7fced29b..7bc65516 100644 --- a/docs/persistence.rst +++ b/docs/persistence.rst @@ -87,6 +87,24 @@ means if you have custom functions (say, a custom function to be used with most ``numpy`` and ``scipy`` functions should work. Therefore, you can actually save built-in functions like ``numpy.sqrt``. +Supported libraries +------------------- + +Skops intends to support all of **scikit-learn**, that is, not only its +estimators, but also other classes like cross validation splitters. Furthermore, +most types from **numpy** and **scipy** should be supported, such as (sparse) +arrays, dtypes, random generators, and ufuncs. + +Apart from this core, we plan to support machine learning libraries commonly +used be the community. So far, those are: + +- `LightGBM `_ (scikit-learn API) + +If you run into a problem using any of the mentioned libraries, this could mean +there is a bug in skops. Please open an issue on `our issue tracker +`_ (but please check first if a +corresponding issue already exists). 
+ Roadmap ------- diff --git a/skops/_min_dependencies.py b/skops/_min_dependencies.py index a3f1ced7..83c92b6a 100644 --- a/skops/_min_dependencies.py +++ b/skops/_min_dependencies.py @@ -27,6 +27,9 @@ "matplotlib": ("3.3", "docs, tests", None), "pandas": ("1", "docs, tests", None), "typing_extensions": ("3.7", "install", "python_full_version < '3.8'"), + # required for persistence tests of external libraries + "lightgbm": ("3", "tests", None), + "xgboost": ("1.7", "tests", None), } diff --git a/skops/io/_persist.py b/skops/io/_persist.py index e802e840..d79a0d42 100644 --- a/skops/io/_persist.py +++ b/skops/io/_persist.py @@ -14,7 +14,7 @@ # We load the dispatch functions from the corresponding modules and register # them. -modules = ["._general", "._numpy", "._scipy", "._sklearn"] +modules = ["._general", "._numpy", "._scipy", "._sklearn", "._xgboost"] for module_name in modules: # register exposed functions for get_state and get_tree module = importlib.import_module(module_name, package="skops.io") diff --git a/skops/io/_xgboost.py b/skops/io/_xgboost.py new file mode 100644 index 00000000..8da4c5ba --- /dev/null +++ b/skops/io/_xgboost.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +import io +import tempfile +from typing import Any, Sequence +from uuid import uuid4 + +import sklearn.exceptions + +from ._audit import Node, get_tree +from ._utils import LoadContext, SaveContext, get_module, get_state + +try: + import xgboost +except ImportError: + xgboost = None # type: ignore + + +def xgboost_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: + res = { + "__class__": obj.__class__.__name__, + "__module__": get_module(type(obj)), + "__loader__": "XGBoostNode", + "constructor": type(obj).__name__, + } + + # fitted vs not fitted models require different approaches + try: + # xgboost doesn't allow to save to a memory buffer, so take roundtrip + # through temp file + tmp_file = f"{tempfile.mkdtemp()}.ubj" + obj.save_model(tmp_file) + with 
open(tmp_file, "rb") as f: + data_buffer = io.BytesIO(f.read()) + f_name = f"xgboost_{uuid4()}.json" + save_context.zip_file.writestr(f_name, data_buffer.getbuffer()) + res.update(type="xgboost-format", file=f_name) + except sklearn.exceptions.NotFittedError: + res.update(type="params", content=get_state(obj.get_params(), save_context)) + + return res + + +class XGBoostNode(Node): + def __init__( + self, + state: dict[str, Any], + load_context: LoadContext, + trusted: bool | Sequence[str] = False, + ) -> None: + super().__init__(state, load_context, trusted) + self.type = state["type"] + self.trusted = self._get_trusted(trusted, []) + + # list of constructors is hard-coded for higher security + constructors = { + "XGBClassifier": xgboost.XGBClassifier, + "XGBRegressor": xgboost.XGBRegressor, + "XGBRFClassifier": xgboost.XGBRFClassifier, + "XGBRFRegressor": xgboost.XGBRFRegressor, + "XGBRanker": xgboost.XGBRanker, + } + self.children = {"constructor": constructors[state["constructor"]]} + if self.type == "xgboost-format": # fitted xgboost model + self.children["content"] = io.BytesIO(load_context.src.read(state["file"])) + else: + self.children["content"] = get_tree(state["content"], load_context) + + def _construct(self): + cls = self.children["constructor"] + instance = cls() + + if self.type == "xgboost-format": # fitted + # load_model works with bytearray, so no temp file necessary here + instance.load_model(bytearray(self.children["content"].getvalue())) + else: + params = self.children["content"].construct() + instance.set_params(**params) + + return instance + + +GET_STATE_DISPATCH_FUNCTIONS = [] # type: ignore +NODE_TYPE_MAPPING = {} + +if xgboost is not None: + GET_STATE_DISPATCH_FUNCTIONS.append((xgboost.sklearn.XGBModel, xgboost_get_state)) + NODE_TYPE_MAPPING["XGBoostNode"] = XGBoostNode diff --git a/skops/io/tests/_utils.py b/skops/io/tests/_utils.py new file mode 100644 index 00000000..fb870122 --- /dev/null +++ b/skops/io/tests/_utils.py @@ -0,0 
+1,145 @@ +import warnings + +import numpy as np +from scipy import sparse +from sklearn.base import BaseEstimator + + +def _is_steps_like(obj): + # helper function to check if an object is something like Pipeline.steps, + # i.e. a list of tuples of names and estimators + if not isinstance(obj, list): # must be a list + return False + + if not obj: # must not be empty + return False + + if not isinstance(obj[0], tuple): # must be list of tuples + return False + + lens = set(map(len, obj)) + if not lens == {2}: # all elements must be length 2 tuples + return False + + keys, vals = list(zip(*obj)) + + if len(keys) != len(set(keys)): # keys must be unique + return False + + if not all(map(lambda x: isinstance(x, (type(None), BaseEstimator)), vals)): + # values must be BaseEstimators or None + return False + + return True + + +def _assert_generic_objects_equal(val1, val2): + def _is_builtin(val): + # Check if value is a builtin type + return getattr(getattr(val, "__class__", {}), "__module__", None) == "builtins" + + if isinstance(val1, (list, tuple, np.ndarray)): + assert len(val1) == len(val2) + for subval1, subval2 in zip(val1, val2): + _assert_generic_objects_equal(subval1, subval2) + return + + assert type(val1) == type(val2) + if hasattr(val1, "__dict__"): + assert_params_equal(val1.__dict__, val2.__dict__) + elif _is_builtin(val1): + assert val1 == val2 + else: + # not a normal Python class, could be e.g. a Cython class + assert val1.__reduce__() == val2.__reduce__() + + +def _assert_tuples_equal(val1, val2): + assert len(val1) == len(val2) + for subval1, subval2 in zip(val1, val2): + _assert_vals_equal(subval1, subval2) + + +def _assert_vals_equal(val1, val2): + if hasattr(val1, "__getstate__"): + # This includes BaseEstimator since they implement __getstate__ and + # that returns the parameters as well. + # + # Some objects return a tuple of parameters, others a dict. 
+ state1 = val1.__getstate__() + state2 = val2.__getstate__() + assert type(state1) == type(state2) + if isinstance(state1, tuple): + _assert_tuples_equal(state1, state2) + else: + assert_params_equal(val1.__getstate__(), val2.__getstate__()) + elif sparse.issparse(val1): + assert sparse.issparse(val2) and ((val1 - val2).nnz == 0) + elif isinstance(val1, (np.ndarray, np.generic)): + if len(val1.dtype) == 0: + # for arrays with at least 2 dimensions, check that contiguity is + # preserved + if val1.squeeze().ndim > 1: + assert val1.flags["C_CONTIGUOUS"] is val2.flags["C_CONTIGUOUS"] + assert val1.flags["F_CONTIGUOUS"] is val2.flags["F_CONTIGUOUS"] + if val1.dtype == object: + assert val2.dtype == object + assert val1.shape == val2.shape + for subval1, subval2 in zip(val1, val2): + _assert_generic_objects_equal(subval1, subval2) + else: + # simple comparison of arrays with simple dtypes, almost all + # arrays are of this sort. + np.testing.assert_array_equal(val1, val2) + elif len(val1.shape) == 1: + # comparing arrays with structured dtypes, but they have to be 1D + # arrays. This is what we get from the Tree's state. + assert np.all([x == y for x, y in zip(val1, val2)]) + else: + # we don't know what to do with these values, for now. + assert False + elif isinstance(val1, (tuple, list)): + assert len(val1) == len(val2) + for subval1, subval2 in zip(val1, val2): + _assert_vals_equal(subval1, subval2) + elif isinstance(val1, float) and np.isnan(val1): + assert np.isnan(val2) + elif isinstance(val1, dict): + # dictionaries are compared by comparing their values recursively. 
+ assert set(val1.keys()) == set(val2.keys()) + for key in val1: + _assert_vals_equal(val1[key], val2[key]) + elif hasattr(val1, "__dict__") and hasattr(val2, "__dict__"): + _assert_vals_equal(val1.__dict__, val2.__dict__) + elif isinstance(val1, np.ufunc): + assert val1 == val2 + elif val1.__class__.__module__ == "builtins": + assert val1 == val2 + else: + _assert_generic_objects_equal(val1, val2) + + +def assert_params_equal(params1, params2): + # helper function to compare estimator dictionaries of parameters + assert len(params1) == len(params2) + assert set(params1.keys()) == set(params2.keys()) + for key in params1: + with warnings.catch_warnings(): + # this is to silence the deprecation warning from _DictWithDeprecatedKeys + warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn") + val1, val2 = params1[key], params2[key] + assert type(val1) == type(val2) + + if _is_steps_like(val1): + # Deal with Pipeline.steps, FeatureUnion.transformer_list, etc. + assert _is_steps_like(val2) + val1, val2 = dict(val1), dict(val2) + + if isinstance(val1, (tuple, list)): + assert len(val1) == len(val2) + for subval1, subval2 in zip(val1, val2): + _assert_vals_equal(subval1, subval2) + elif isinstance(val1, dict): + assert_params_equal(val1, val2) + else: + _assert_vals_equal(val1, val2) diff --git a/skops/io/tests/test_external.py b/skops/io/tests/test_external.py new file mode 100644 index 00000000..bf6c04b6 --- /dev/null +++ b/skops/io/tests/test_external.py @@ -0,0 +1,296 @@ +import sys + +import pytest +from sklearn.datasets import make_classification, make_regression +from sklearn.utils._testing import assert_allclose_dense_sparse + +from skops.io import dumps, loads +from skops.io.tests._utils import assert_params_equal + +ATOL = 1e-6 if sys.platform == "darwin" else 1e-7 +# Default settings for generated data +N_SAMPLES = 30 +N_FEATURES = 10 +N_CLASSES = 4 # for classification only + + +@pytest.fixture(scope="module") +def clf_data(): + X, y = 
make_classification( + n_samples=N_SAMPLES, + n_classes=N_CLASSES, + n_features=N_FEATURES, + random_state=0, + n_redundant=1, + n_informative=N_FEATURES - 1, + ) + return X, y + + +@pytest.fixture(scope="module") +def regr_data(): + X, y = make_regression(n_samples=N_SAMPLES, n_features=N_FEATURES, random_state=0) + return X, y + + +@pytest.fixture(scope="module") +def rank_data(clf_data): + X, y = clf_data + group = [10 for _ in range(N_SAMPLES // 10)] + n = sum(group) + if N_SAMPLES > n: + group[-1] += N_SAMPLES - n + assert sum(group) == N_SAMPLES + return X, y, group + + +def check_estimator(estimator, trusted): + loaded = loads(dumps(estimator), trusted=trusted) + assert_params_equal(estimator.get_params(), loaded.get_params()) + + +def check_method_outputs(estimator, X, trusted): + loaded = loads(dumps(estimator), trusted=trusted) + for method in [ + "predict", + "predict_proba", + "decision_function", + "transform", + "predict_log_proba", + ]: + err_msg = ( + f"{estimator.__class__.__name__}.{method}() doesn't produce the same" + " results after loading the persisted model." 
+ ) + if hasattr(estimator, method): + X_out1 = getattr(estimator, method)(X) + X_out2 = getattr(loaded, method)(X) + assert_allclose_dense_sparse(X_out1, X_out2, err_msg=err_msg, atol=ATOL) + + +class TestLightGBM: + """Tests for LGBMClassifier, LGBMRegressor, LGBMRanker""" + + @pytest.fixture(autouse=True) + def lgbm(self): + lgbm = pytest.importorskip("lightgbm") + return lgbm + + @pytest.fixture + def trusted(self): + return [ + "collections.defaultdict", + "lightgbm.basic.Booster", + "lightgbm.sklearn.LGBMClassifier", + "lightgbm.sklearn.LGBMRegressor", + "lightgbm.sklearn.LGBMRanker", + "numpy.int64", + "sklearn.preprocessing._label.LabelEncoder", + ] + + boosting_types = ["gbdt", "dart", "goss", "rf"] + + @pytest.mark.parametrize("boosting_type", boosting_types) + def test_classifier(self, lgbm, clf_data, trusted, boosting_type): + kw = {} + if boosting_type == "rf": + kw["bagging_fraction"] = 0.5 + kw["bagging_freq"] = 2 + + estimator = lgbm.LGBMClassifier(boosting_type=boosting_type, **kw) + check_estimator(estimator, trusted=trusted) + + X, y = clf_data + estimator.fit(X, y) + check_estimator(estimator, trusted=trusted) + check_method_outputs(estimator, X, trusted=trusted) + + @pytest.mark.parametrize("boosting_type", boosting_types) + def test_regressor(self, lgbm, regr_data, trusted, boosting_type): + kw = {} + if boosting_type == "rf": + kw["bagging_fraction"] = 0.5 + kw["bagging_freq"] = 2 + + estimator = lgbm.LGBMRegressor(boosting_type=boosting_type, **kw) + check_estimator(estimator, trusted=trusted) + + X, y = regr_data + estimator.fit(X, y) + check_estimator(estimator, trusted=trusted) + check_method_outputs(estimator, X, trusted=trusted) + + @pytest.mark.parametrize("boosting_type", boosting_types) + def test_ranker(self, lgbm, rank_data, trusted, boosting_type): + kw = {} + if boosting_type == "rf": + kw["bagging_fraction"] = 0.5 + kw["bagging_freq"] = 2 + + estimator = lgbm.LGBMRanker(boosting_type=boosting_type, **kw) + 
check_estimator(estimator, trusted=trusted) + + X, y, group = rank_data + estimator.fit(X, y, group=group) + check_estimator(estimator, trusted=trusted) + check_method_outputs(estimator, X, trusted=trusted) + + +class TestXGBoost: + """Tests for XGBClassifier, XGBRegressor, XGBRFClassifier, XGBRFRegressor, XGBRanker + + Known bugs: + + - When initialzing with tree_method=None, its value resolves to "exact", but + after loading, it resolves to "auto". + - When initializing with gpu_id=None, its value resolves to 0, but after + loading, it resolves to -1. + + This can be verified like this: + + >>> import xgboost + >>> estimator = xgboost.XGBClassifier(tree_method=None) + >>> X, y = [[0, 1], [2, 3]], [0, 1] + >>> estimator.fit(X, y) + XGBClassifier(...) + >>> print(estimator.tree_method) + None + >>> print(estimator.get_params()["tree_method"]) + exact + >>> # after save/load roundtrip, values of get_params change + >>> import tempfile + >>> tmp_file = f"{tempfile.mkdtemp()}.ubj" + >>> estimator.save_model(tmp_file) + >>> estimator.load_model(tmp_file) + >>> print(estimator.tree_method) + None + >>> print(estimator.get_params()["tree_method"]) + auto + + >>> estimator = xgboost.XGBClassifier(tree_method='gpu_hist', booster='gbtree') + >>> estimator.fit(X, y) + XGBClassifier(...) + >>> print(estimator.gpu_id) + None + >>> print(estimator.get_params()["gpu_id"]) + 0 + >>> # after save/load roundtrip, values of get_params change + >>> estimator.save_model(tmp_file) + >>> # for gpu_id, the estimator needs to be re-initialized for the effect to occur + >>> estimator = xgboost.XGBClassifier() + >>> estimator.load_model(tmp_file) + >>> print(estimator.gpu_id) + None + >>> print(estimator.get_params()["gpu_id"]) + -1 + + As can be seen, this has nothing to do with skops but is a bug/feature of + xgboost. We assume that this has no practical consequences and thus avoid + testing these cases. 
+ + """ + + @pytest.fixture(autouse=True) + def xgboost(self): + xgboost = pytest.importorskip("xgboost") + return xgboost + + @pytest.fixture + def trusted(self): + return [ + "xgboost.sklearn.XGBClassifier", + "xgboost.sklearn.XGBRegressor", + "xgboost.sklearn.XGBRFClassifier", + "xgboost.sklearn.XGBRFRegressor", + "xgboost.sklearn.XGBRanker", + "builtins.bytearray", + "xgboost.core.Booster", + ] + + boosters = ["gbtree", "gblinear", "dart"] + tree_methods = ["approx", "hist", "gpu_hist", "auto"] + + @pytest.mark.parametrize("booster", boosters) + @pytest.mark.parametrize("tree_method", tree_methods) + def test_classifier(self, xgboost, clf_data, trusted, booster, tree_method): + if (booster == "gblinear") and (tree_method != "approx"): + # This parameter combination is not supported in XGBoost + return + + estimator = xgboost.XGBClassifier( + booster=booster, tree_method=tree_method, gpu_id=-1 + ) + check_estimator(estimator, trusted=trusted) + + X, y = clf_data + estimator.fit(X, y) + check_estimator(estimator, trusted=trusted) + check_method_outputs(estimator, X, trusted=trusted) + + @pytest.mark.parametrize("booster", boosters) + @pytest.mark.parametrize("tree_method", tree_methods) + def test_regressor(self, xgboost, regr_data, trusted, booster, tree_method): + if (booster == "gblinear") and (tree_method != "approx"): + # This parameter combination is not supported in XGBoost + return + + estimator = xgboost.XGBRegressor( + booster=booster, tree_method=tree_method, gpu_id=-1 + ) + check_estimator(estimator, trusted=trusted) + + X, y = regr_data + estimator.fit(X, y) + check_estimator(estimator, trusted=trusted) + check_method_outputs(estimator, X, trusted=trusted) + + @pytest.mark.parametrize("booster", boosters) + @pytest.mark.parametrize("tree_method", tree_methods) + def test_rf_classifier(self, xgboost, clf_data, trusted, booster, tree_method): + if (booster == "gblinear") and (tree_method != "approx"): + # This parameter combination is not supported in 
XGBoost + return + + estimator = xgboost.XGBRFClassifier( + booster=booster, tree_method=tree_method, gpu_id=-1 + ) + check_estimator(estimator, trusted=trusted) + + X, y = clf_data + estimator.fit(X, y) + check_estimator(estimator, trusted=trusted) + check_method_outputs(estimator, X, trusted=trusted) + + @pytest.mark.parametrize("booster", boosters) + @pytest.mark.parametrize("tree_method", tree_methods) + def test_rf_regressor(self, xgboost, regr_data, trusted, booster, tree_method): + if (booster == "gblinear") and (tree_method != "approx"): + # This parameter combination is not supported in XGBoost + return + + estimator = xgboost.XGBRFRegressor( + booster=booster, tree_method=tree_method, gpu_id=-1 + ) + check_estimator(estimator, trusted=trusted) + + X, y = regr_data + estimator.fit(X, y) + check_estimator(estimator, trusted=trusted) + check_method_outputs(estimator, X, trusted=trusted) + + @pytest.mark.parametrize("booster", boosters) + @pytest.mark.parametrize("tree_method", tree_methods) + def test_ranker(self, xgboost, rank_data, trusted, booster, tree_method): + if (booster == "gblinear") and (tree_method != "approx"): + # This parameter combination is not supported in XGBoost + return + + estimator = xgboost.XGBRanker( + booster=booster, tree_method=tree_method, gpu_id=-1 + ) + check_estimator(estimator, trusted=trusted) + + X, y, group = rank_data + estimator.fit(X, y, group=group) + check_estimator(estimator, trusted=trusted) + check_method_outputs(estimator, X, trusted=trusted) diff --git a/skops/io/tests/test_persist.py b/skops/io/tests/test_persist.py index c38871c6..679f93a8 100644 --- a/skops/io/tests/test_persist.py +++ b/skops/io/tests/test_persist.py @@ -58,6 +58,7 @@ from skops.io._sklearn import UNSUPPORTED_TYPES from skops.io._utils import LoadContext, SaveContext, _get_state, get_state from skops.io.exceptions import UnsupportedTypeException +from skops.io.tests._utils import assert_params_equal # Default settings for X N_SAMPLES = 50 @@ 
-112,7 +113,7 @@ def wrapper(state, load_context, trusted): return wrapper - modules = ["._general", "._numpy", "._scipy", "._sklearn"] + modules = ["._general", "._numpy", "._scipy", "._sklearn", "._xgboost"] for module_name in modules: # overwrite exposed functions for get_state and get_tree module = importlib.import_module(module_name, package="skops.io") @@ -257,146 +258,6 @@ def _unsupported_estimators(type_filter=None): yield estimator -def _is_steps_like(obj): - # helper function to check if an object is something like Pipeline.steps, - # i.e. a list of tuples of names and estimators - if not isinstance(obj, list): # must be a list - return False - - if not obj: # must not be empty - return False - - if not isinstance(obj[0], tuple): # must be list of tuples - return False - - lens = set(map(len, obj)) - if not lens == {2}: # all elements must be length 2 tuples - return False - - keys, vals = list(zip(*obj)) - - if len(keys) != len(set(keys)): # keys must be unique - return False - - if not all(map(lambda x: isinstance(x, (type(None), BaseEstimator)), vals)): - # values must be BaseEstimators or None - return False - - return True - - -def _assert_generic_objects_equal(val1, val2): - def _is_builtin(val): - # Check if value is a builtin type - return getattr(getattr(val, "__class__", {}), "__module__", None) == "builtins" - - if isinstance(val1, (list, tuple, np.ndarray)): - assert len(val1) == len(val2) - for subval1, subval2 in zip(val1, val2): - _assert_generic_objects_equal(subval1, subval2) - return - - assert type(val1) == type(val2) - if hasattr(val1, "__dict__"): - assert_params_equal(val1.__dict__, val2.__dict__) - elif _is_builtin(val1): - assert val1 == val2 - else: - # not a normal Python class, could be e.g. 
a Cython class - assert val1.__reduce__() == val2.__reduce__() - - -def _assert_tuples_equal(val1, val2): - assert len(val1) == len(val2) - for subval1, subval2 in zip(val1, val2): - _assert_vals_equal(subval1, subval2) - - -def _assert_vals_equal(val1, val2): - if hasattr(val1, "__getstate__"): - # This includes BaseEstimator since they implement __getstate__ and - # that returns the parameters as well. - # - # Some objects return a tuple of parameters, others a dict. - state1 = val1.__getstate__() - state2 = val2.__getstate__() - assert type(state1) == type(state2) - if isinstance(state1, tuple): - _assert_tuples_equal(state1, state2) - else: - assert_params_equal(val1.__getstate__(), val2.__getstate__()) - elif sparse.issparse(val1): - assert sparse.issparse(val2) and ((val1 - val2).nnz == 0) - elif isinstance(val1, (np.ndarray, np.generic)): - if len(val1.dtype) == 0: - # for arrays with at least 2 dimensions, check that contiguity is - # preserved - if val1.squeeze().ndim > 1: - assert val1.flags["C_CONTIGUOUS"] is val2.flags["C_CONTIGUOUS"] - assert val1.flags["F_CONTIGUOUS"] is val2.flags["F_CONTIGUOUS"] - if val1.dtype == object: - assert val2.dtype == object - assert val1.shape == val2.shape - for subval1, subval2 in zip(val1, val2): - _assert_generic_objects_equal(subval1, subval2) - else: - # simple comparison of arrays with simple dtypes, almost all - # arrays are of this sort. - np.testing.assert_array_equal(val1, val2) - elif len(val1.shape) == 1: - # comparing arrays with structured dtypes, but they have to be 1D - # arrays. This is what we get from the Tree's state. - assert np.all([x == y for x, y in zip(val1, val2)]) - else: - # we don't know what to do with these values, for now. 
- assert False - elif isinstance(val1, (tuple, list)): - assert len(val1) == len(val2) - for subval1, subval2 in zip(val1, val2): - _assert_vals_equal(subval1, subval2) - elif isinstance(val1, float) and np.isnan(val1): - assert np.isnan(val2) - elif isinstance(val1, dict): - # dictionaries are compared by comparing their values recursively. - assert set(val1.keys()) == set(val2.keys()) - for key in val1: - _assert_vals_equal(val1[key], val2[key]) - elif hasattr(val1, "__dict__") and hasattr(val2, "__dict__"): - _assert_vals_equal(val1.__dict__, val2.__dict__) - elif isinstance(val1, np.ufunc): - assert val1 == val2 - elif val1.__class__.__module__ == "builtins": - assert val1 == val2 - else: - _assert_generic_objects_equal(val1, val2) - - -def assert_params_equal(params1, params2): - # helper function to compare estimator dictionaries of parameters - assert len(params1) == len(params2) - assert set(params1.keys()) == set(params2.keys()) - for key in params1: - with warnings.catch_warnings(): - # this is to silence the deprecation warning from _DictWithDeprecatedKeys - warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn") - val1, val2 = params1[key], params2[key] - assert type(val1) == type(val2) - - if _is_steps_like(val1): - # Deal with Pipeline.steps, FeatureUnion.transformer_list, etc. 
- assert _is_steps_like(val2) - val1, val2 = dict(val1), dict(val2) - - if isinstance(val1, (tuple, list)): - assert len(val1) == len(val2) - for subval1, subval2 in zip(val1, val2): - _assert_vals_equal(subval1, subval2) - elif isinstance(val1, dict): - assert_params_equal(val1, val2) - else: - _assert_vals_equal(val1, val2) - - @pytest.mark.parametrize( "estimator", _tested_estimators(), ids=_get_check_estimator_ids ) From 1c138279bff6ab5c94f0bfa97e7c6609dfc10291 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Thu, 8 Dec 2022 17:09:06 +0100 Subject: [PATCH 02/12] Add entry to changes.rst --- docs/changes.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/changes.rst b/docs/changes.rst index 034074a4..082bff80 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -14,6 +14,8 @@ v0.4 - :func:`.io.dump` and :func:`.io.load` now work with file like objects, which means you can use them with the ``with open(...) as f: dump(obj, f)`` pattern, like you'd do with ``pickle``. :pr:`234` by `Benjamin Bossan`_. +- Persistence now supports LightGBM and XGBoost. :pr:`244` by `Benjamin + Bossan`_. 
v0.3 ---- From 3196f6ae23d21443a278f9007093c7da95de093b Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Thu, 8 Dec 2022 17:11:41 +0100 Subject: [PATCH 03/12] Lower min version of XGBoost to 1.6 1.7 not supported for Python 3.7 --- skops/_min_dependencies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skops/_min_dependencies.py b/skops/_min_dependencies.py index 83c92b6a..2065cb14 100644 --- a/skops/_min_dependencies.py +++ b/skops/_min_dependencies.py @@ -29,7 +29,7 @@ "typing_extensions": ("3.7", "install", "python_full_version < '3.8'"), # required for persistence tests of external libraries "lightgbm": ("3", "tests", None), - "xgboost": ("1.7", "tests", None), + "xgboost": ("1.6", "tests", None), } From 63e702ee6768189195fd1a00771a7bb55c9e021d Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Thu, 8 Dec 2022 17:20:07 +0100 Subject: [PATCH 04/12] Remove gpu_hist from tested parameters Fails when no GPU is available and I couldn't find a way to detect from Python if a GPU is available or not (short of trying and catching the error). --- skops/io/tests/test_external.py | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/skops/io/tests/test_external.py b/skops/io/tests/test_external.py index bf6c04b6..6fbfd610 100644 --- a/skops/io/tests/test_external.py +++ b/skops/io/tests/test_external.py @@ -143,8 +143,8 @@ class TestXGBoost: - When initialzing with tree_method=None, its value resolves to "exact", but after loading, it resolves to "auto". - - When initializing with gpu_id=None, its value resolves to 0, but after - loading, it resolves to -1. + - When initializing with tree_method='gpu_hist' and gpu_id=None, the + latter's value resolves to 0, but after loading, it resolves to -1. 
This can be verified like this: @@ -208,7 +208,7 @@ def trusted(self): ] boosters = ["gbtree", "gblinear", "dart"] - tree_methods = ["approx", "hist", "gpu_hist", "auto"] + tree_methods = ["approx", "hist", "auto"] @pytest.mark.parametrize("booster", boosters) @pytest.mark.parametrize("tree_method", tree_methods) @@ -217,9 +217,7 @@ def test_classifier(self, xgboost, clf_data, trusted, booster, tree_method): # This parameter combination is not supported in XGBoost return - estimator = xgboost.XGBClassifier( - booster=booster, tree_method=tree_method, gpu_id=-1 - ) + estimator = xgboost.XGBClassifier(booster=booster, tree_method=tree_method) check_estimator(estimator, trusted=trusted) X, y = clf_data @@ -234,9 +232,7 @@ def test_regressor(self, xgboost, regr_data, trusted, booster, tree_method): # This parameter combination is not supported in XGBoost return - estimator = xgboost.XGBRegressor( - booster=booster, tree_method=tree_method, gpu_id=-1 - ) + estimator = xgboost.XGBRegressor(booster=booster, tree_method=tree_method) check_estimator(estimator, trusted=trusted) X, y = regr_data @@ -251,9 +247,7 @@ def test_rf_classifier(self, xgboost, clf_data, trusted, booster, tree_method): # This parameter combination is not supported in XGBoost return - estimator = xgboost.XGBRFClassifier( - booster=booster, tree_method=tree_method, gpu_id=-1 - ) + estimator = xgboost.XGBRFClassifier(booster=booster, tree_method=tree_method) check_estimator(estimator, trusted=trusted) X, y = clf_data @@ -268,9 +262,7 @@ def test_rf_regressor(self, xgboost, regr_data, trusted, booster, tree_method): # This parameter combination is not supported in XGBoost return - estimator = xgboost.XGBRFRegressor( - booster=booster, tree_method=tree_method, gpu_id=-1 - ) + estimator = xgboost.XGBRFRegressor(booster=booster, tree_method=tree_method) check_estimator(estimator, trusted=trusted) X, y = regr_data @@ -285,9 +277,7 @@ def test_ranker(self, xgboost, rank_data, trusted, booster, tree_method): # 
This parameter combination is not supported in XGBoost return - estimator = xgboost.XGBRanker( - booster=booster, tree_method=tree_method, gpu_id=-1 - ) + estimator = xgboost.XGBRanker(booster=booster, tree_method=tree_method) check_estimator(estimator, trusted=trusted) X, y, group = rank_data From 9616b70913ae0a160f30c8982c874680fc9ca9ae Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Thu, 8 Dec 2022 17:41:06 +0100 Subject: [PATCH 05/12] Trust np.int32 in lgbm test Apparently, on Windows, np.int32 is used, not np.int64. --- skops/io/tests/test_external.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/skops/io/tests/test_external.py b/skops/io/tests/test_external.py index 6fbfd610..0797d4a8 100644 --- a/skops/io/tests/test_external.py +++ b/skops/io/tests/test_external.py @@ -78,12 +78,14 @@ def lgbm(self): @pytest.fixture def trusted(self): + # TODO: adjust once more types are trusted by default return [ "collections.defaultdict", "lightgbm.basic.Booster", "lightgbm.sklearn.LGBMClassifier", "lightgbm.sklearn.LGBMRegressor", "lightgbm.sklearn.LGBMRanker", + "numpy.int32", "numpy.int64", "sklearn.preprocessing._label.LabelEncoder", ] @@ -197,6 +199,7 @@ def xgboost(self): @pytest.fixture def trusted(self): + # TODO: adjust once more types are trusted by default return [ "xgboost.sklearn.XGBClassifier", "xgboost.sklearn.XGBRegressor", From 2a73e49485dad71bce11ff90fe4d688979191581 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Fri, 9 Dec 2022 13:53:25 +0100 Subject: [PATCH 06/12] Remove example from docstring replicating xgb bug --- skops/io/tests/test_external.py | 51 +++++---------------------------- 1 file changed, 7 insertions(+), 44 deletions(-) diff --git a/skops/io/tests/test_external.py b/skops/io/tests/test_external.py index 0797d4a8..4bfc0228 100644 --- a/skops/io/tests/test_external.py +++ b/skops/io/tests/test_external.py @@ -144,51 +144,14 @@ class TestXGBoost: Known bugs: - When initialzing with tree_method=None, its value resolves to 
"exact", but - after loading, it resolves to "auto". + after loading, it resolves to "auto" when calling get_params(). - When initializing with tree_method='gpu_hist' and gpu_id=None, the - latter's value resolves to 0, but after loading, it resolves to -1. - - This can be verified like this: - - >>> import xgboost - >>> estimator = xgboost.XGBClassifier(tree_method=None) - >>> X, y = [[0, 1], [2, 3]], [0, 1] - >>> estimator.fit(X, y) - XGBClassifier(...) - >>> print(estimator.tree_method) - None - >>> print(estimator.get_params()["tree_method"]) - exact - >>> # after save/load roundtrip, values of get_params change - >>> import tempfile - >>> tmp_file = f"{tempfile.mkdtemp()}.ubj" - >>> estimator.save_model(tmp_file) - >>> estimator.load_model(tmp_file) - >>> print(estimator.tree_method) - None - >>> print(estimator.get_params()["tree_method"]) - auto - - >>> estimator = xgboost.XGBClassifier(tree_method='gpu_hist', booster='gbtree') - >>> estimator.fit(X, y) - XGBClassifier(...) - >>> print(estimator.gpu_id) - None - >>> print(estimator.get_params()["gpu_id"]) - 0 - >>> # after save/load roundtrip, values of get_params change - >>> estimator.save_model(tmp_file) - >>> # for gpu_id, the estimator needs to be re-initialized for the effect to occur - >>> estimator = xgboost.XGBClassifier() - >>> estimator.load_model(tmp_file) - >>> print(estimator.gpu_id) - None - >>> print(estimator.get_params()["gpu_id"]) - -1 - - As can be seen, this has nothing to do with skops but is a bug/feature of - xgboost. We assume that this has no practical consequences and thus avoid - testing these cases. + latter's value resolves to 0, but after loading, it resolves to -1, when + calling get_params() + + These discrepancies occur regardless of skops, so they're a problem in + xgboost itself. We assume that this has no practical consequences and thus + avoid testing these cases. 
""" From 7fad01c829b3b24c2044b2e9a34b07ccf3688c42 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Mon, 12 Dec 2022 16:57:08 +0100 Subject: [PATCH 07/12] Use bytearray instead of xgboost custom format Added support for bytes and bytearray, which is sufficient to dump and load xgboost. The xgboost specific code was commented out for now. To serialize bytes in json, base64 encoding was used. --- skops/io/_general.py | 46 +++++++++ skops/io/_xgboost.py | 168 ++++++++++++++++----------------- skops/io/tests/test_persist.py | 13 +++ 3 files changed, 143 insertions(+), 84 deletions(-) diff --git a/skops/io/_general.py b/skops/io/_general.py index 7f27fbcb..099c68fa 100644 --- a/skops/io/_general.py +++ b/skops/io/_general.py @@ -1,5 +1,6 @@ from __future__ import annotations +import base64 import json from functools import partial from types import FunctionType, MethodType @@ -475,12 +476,55 @@ def _construct(self): return json.loads(self.content) +def bytes_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: + content = base64.b64encode(obj).decode("ascii") + res = { + "__class__": obj.__class__.__name__, + "__module__": get_module(type(obj)), + "__loader__": "BytesNode", + "content": get_state(content, save_context), + } + return res + + +def bytearray_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: + res = bytes_get_state(obj, save_context) + res["__loader__"] = "BytearrayNode" + return res + + +class BytesNode(Node): + def __init__( + self, + state: dict[str, Any], + load_context: LoadContext, + trusted: bool | Sequence[str] = False, + ) -> None: + super().__init__(state, load_context, trusted) + self.trusted = self._get_trusted(trusted, []) + self.children = {"content": get_tree(state["content"], load_context)} + + def _construct(self): + content_str = self.children["content"].construct() + content_bytes = base64.b64decode(content_str) + return content_bytes + + +class BytearrayNode(BytesNode): + def _construct(self): + 
content_bytes = super()._construct() + content_bytearray = bytearray(content_bytes) + return content_bytearray + + # tuples of type and function that gets the state of that type GET_STATE_DISPATCH_FUNCTIONS = [ (dict, dict_get_state), (list, list_get_state), (set, set_get_state), (tuple, tuple_get_state), + (bytes, bytes_get_state), + (bytearray, bytearray_get_state), (slice, slice_get_state), (FunctionType, function_get_state), (MethodType, method_get_state), @@ -494,6 +538,8 @@ def _construct(self): "ListNode": ListNode, "SetNode": SetNode, "TupleNode": TupleNode, + "BytesNode": BytesNode, + "BytearrayNode": BytearrayNode, "SliceNode": SliceNode, "FunctionNode": FunctionNode, "MethodNode": MethodNode, diff --git a/skops/io/_xgboost.py b/skops/io/_xgboost.py index 8da4c5ba..b2ef870e 100644 --- a/skops/io/_xgboost.py +++ b/skops/io/_xgboost.py @@ -1,88 +1,88 @@ -from __future__ import annotations - -import io -import tempfile -from typing import Any, Sequence -from uuid import uuid4 - -import sklearn.exceptions - -from ._audit import Node, get_tree -from ._utils import LoadContext, SaveContext, get_module, get_state - -try: - import xgboost -except ImportError: - xgboost = None # type: ignore - - -def xgboost_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: - res = { - "__class__": obj.__class__.__name__, - "__module__": get_module(type(obj)), - "__loader__": "XGBoostNode", - "constructor": type(obj).__name__, - } - - # fitted vs not fitted models require different approaches - try: - # xgboost doesn't allow to save to a memory buffer, so take roundtrip - # through temp file - tmp_file = f"{tempfile.mkdtemp()}.ubj" - obj.save_model(tmp_file) - with open(tmp_file, "rb") as f: - data_buffer = io.BytesIO(f.read()) - f_name = f"xgboost_{uuid4()}.json" - save_context.zip_file.writestr(f_name, data_buffer.getbuffer()) - res.update(type="xgboost-format", file=f_name) - except sklearn.exceptions.NotFittedError: - res.update(type="params", 
content=get_state(obj.get_params(), save_context)) - - return res - - -class XGBoostNode(Node): - def __init__( - self, - state: dict[str, Any], - load_context: LoadContext, - trusted: bool | Sequence[str] = False, - ) -> None: - super().__init__(state, load_context, trusted) - self.type = state["type"] - self.trusted = self._get_trusted(trusted, []) - - # list of constructors is hard-coded for higher security - constructors = { - "XGBClassifier": xgboost.XGBClassifier, - "XGBRegressor": xgboost.XGBRegressor, - "XGBRFClassifier": xgboost.XGBRFClassifier, - "XGBRFRegressor": xgboost.XGBRFRegressor, - "XGBRanker": xgboost.XGBRanker, - } - self.children = {"constructor": constructors[state["constructor"]]} - if self.type == "xgboost-format": # fitted xgboost model - self.children["content"] = io.BytesIO(load_context.src.read(state["file"])) - else: - self.children["content"] = get_tree(state["content"], load_context) - - def _construct(self): - cls = self.children["constructor"] - instance = cls() - - if self.type == "xgboost-format": # fitted - # load_model works with bytearray, so no temp file necessary here - instance.load_model(bytearray(self.children["content"].getvalue())) - else: - params = self.children["content"].construct() - instance.set_params(**params) - - return instance +# from __future__ import annotations + +# import io +# import tempfile +# from typing import Any, Sequence +# from uuid import uuid4 + +# import sklearn.exceptions + +# from ._audit import Node, get_tree +# from ._utils import LoadContext, SaveContext, get_module, get_state + +# try: +# import xgboost +# except ImportError: +# xgboost = None # type: ignore + + +# def xgboost_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: +# res = { +# "__class__": obj.__class__.__name__, +# "__module__": get_module(type(obj)), +# "__loader__": "XGBoostNode", +# "constructor": type(obj).__name__, +# } + +# # fitted vs not fitted models require different approaches +# try: +# # xgboost 
doesn't allow to save to a memory buffer, so take roundtrip +# # through temp file +# tmp_file = f"{tempfile.mkdtemp()}.ubj" +# obj.save_model(tmp_file) +# with open(tmp_file, "rb") as f: +# data_buffer = io.BytesIO(f.read()) +# f_name = f"xgboost_{uuid4()}.json" +# save_context.zip_file.writestr(f_name, data_buffer.getbuffer()) +# res.update(type="xgboost-format", file=f_name) +# except sklearn.exceptions.NotFittedError: +# res.update(type="params", content=get_state(obj.get_params(), save_context)) + +# return res + + +# class XGBoostNode(Node): +# def __init__( +# self, +# state: dict[str, Any], +# load_context: LoadContext, +# trusted: bool | Sequence[str] = False, +# ) -> None: +# super().__init__(state, load_context, trusted) +# self.type = state["type"] +# self.trusted = self._get_trusted(trusted, []) + +# # list of constructors is hard-coded for higher security +# constructors = { +# "XGBClassifier": xgboost.XGBClassifier, +# "XGBRegressor": xgboost.XGBRegressor, +# "XGBRFClassifier": xgboost.XGBRFClassifier, +# "XGBRFRegressor": xgboost.XGBRFRegressor, +# "XGBRanker": xgboost.XGBRanker, +# } +# self.children = {"constructor": constructors[state["constructor"]]} +# if self.type == "xgboost-format": # fitted xgboost model +# self.children["content"] = io.BytesIO(load_context.src.read(state["file"])) +# else: +# self.children["content"] = get_tree(state["content"], load_context) + +# def _construct(self): +# cls = self.children["constructor"] +# instance = cls() + +# if self.type == "xgboost-format": # fitted +# # load_model works with bytearray, so no temp file necessary here +# instance.load_model(bytearray(self.children["content"].getvalue())) +# else: +# params = self.children["content"].construct() +# instance.set_params(**params) + +# return instance GET_STATE_DISPATCH_FUNCTIONS = [] # type: ignore -NODE_TYPE_MAPPING = {} +NODE_TYPE_MAPPING = {} # type: ignore -if xgboost is not None: - GET_STATE_DISPATCH_FUNCTIONS.append((xgboost.sklearn.XGBModel, 
xgboost_get_state)) - NODE_TYPE_MAPPING["XGBoostNode"] = XGBoostNode +# if xgboost is not None: +# GET_STATE_DISPATCH_FUNCTIONS.append((xgboost.sklearn.XGBModel, xgboost_get_state)) +# NODE_TYPE_MAPPING["XGBoostNode"] = XGBoostNode diff --git a/skops/io/tests/test_persist.py b/skops/io/tests/test_persist.py index 679f93a8..b974c91a 100644 --- a/skops/io/tests/test_persist.py +++ b/skops/io/tests/test_persist.py @@ -860,3 +860,16 @@ def test_when_given_object_referenced_twice_loads_as_one_object(obj): persisted_object = loads(dumps(an_object), trusted=True) assert persisted_object["obj_1"] is persisted_object["obj_2"] + + +class EstimatorWithBytes(BaseEstimator): + def fit(self, X, y, **fit_params): + self.bytes_ = b"hello" + self.bytearray_ = bytearray([0, 1, 2, 253, 254, 255]) + return self + + +def test_estimator_with_bytes(): + est = EstimatorWithBytes().fit(None, None) + loaded = loads(dumps(est), trusted=True) + assert_params_equal(est.__dict__, loaded.__dict__) From 9b4a9bf1c76b6d7276c643211d3717e2d58541dd Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Tue, 13 Dec 2022 12:21:57 +0100 Subject: [PATCH 08/12] Save bytes in file, not in schema --- docs/changes.rst | 4 +- skops/io/_general.py | 17 +++---- skops/io/_persist.py | 2 +- skops/io/_xgboost.py | 88 ---------------------------------- skops/io/tests/test_persist.py | 15 +++++- 5 files changed, 26 insertions(+), 100 deletions(-) delete mode 100644 skops/io/_xgboost.py diff --git a/docs/changes.rst b/docs/changes.rst index de81433f..d1d01360 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -16,8 +16,8 @@ v0.4 pattern, like you'd do with ``pickle``. :pr:`234` by `Benjamin Bossan`_. - All `scikit-learn` estimators are trusted by default. :pr:`237` by :user:`Edoardo Abati `. -- Persistence now supports LightGBM and XGBoost. :pr:`244` by `Benjamin - Bossan`_. +- Persistence now supports bytes and bytearrays, added tests to verify that + LightGBM and XGBoost work. :pr:`244` by `Benjamin Bossan`_. 
v0.3 ---- diff --git a/skops/io/_general.py b/skops/io/_general.py index fff390b2..77bd99cf 100644 --- a/skops/io/_general.py +++ b/skops/io/_general.py @@ -1,7 +1,8 @@ from __future__ import annotations -import base64 +import io import json +import uuid from functools import partial from types import FunctionType, MethodType from typing import Any, Sequence @@ -477,12 +478,13 @@ def _construct(self): def bytes_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: - content = base64.b64encode(obj).decode("ascii") + f_name = f"{uuid.uuid4()}.bin" + save_context.zip_file.writestr(f_name, obj) res = { "__class__": obj.__class__.__name__, "__module__": get_module(type(obj)), "__loader__": "BytesNode", - "content": get_state(content, save_context), + "file": f_name, } return res @@ -502,18 +504,17 @@ def __init__( ) -> None: super().__init__(state, load_context, trusted) self.trusted = self._get_trusted(trusted, []) - self.children = {"content": get_tree(state["content"], load_context)} + self.children = {"content": io.BytesIO(load_context.src.read(state["file"]))} def _construct(self): - content_str = self.children["content"].construct() - content_bytes = base64.b64decode(content_str) - return content_bytes + content = self.children["content"].getvalue() + return content class BytearrayNode(BytesNode): def _construct(self): content_bytes = super()._construct() - content_bytearray = bytearray(content_bytes) + content_bytearray = bytearray(list(content_bytes)) return content_bytearray diff --git a/skops/io/_persist.py b/skops/io/_persist.py index d79a0d42..e802e840 100644 --- a/skops/io/_persist.py +++ b/skops/io/_persist.py @@ -14,7 +14,7 @@ # We load the dispatch functions from the corresponding modules and register # them. 
-modules = ["._general", "._numpy", "._scipy", "._sklearn", "._xgboost"] +modules = ["._general", "._numpy", "._scipy", "._sklearn"] for module_name in modules: # register exposed functions for get_state and get_tree module = importlib.import_module(module_name, package="skops.io") diff --git a/skops/io/_xgboost.py b/skops/io/_xgboost.py deleted file mode 100644 index b2ef870e..00000000 --- a/skops/io/_xgboost.py +++ /dev/null @@ -1,88 +0,0 @@ -# from __future__ import annotations - -# import io -# import tempfile -# from typing import Any, Sequence -# from uuid import uuid4 - -# import sklearn.exceptions - -# from ._audit import Node, get_tree -# from ._utils import LoadContext, SaveContext, get_module, get_state - -# try: -# import xgboost -# except ImportError: -# xgboost = None # type: ignore - - -# def xgboost_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: -# res = { -# "__class__": obj.__class__.__name__, -# "__module__": get_module(type(obj)), -# "__loader__": "XGBoostNode", -# "constructor": type(obj).__name__, -# } - -# # fitted vs not fitted models require different approaches -# try: -# # xgboost doesn't allow to save to a memory buffer, so take roundtrip -# # through temp file -# tmp_file = f"{tempfile.mkdtemp()}.ubj" -# obj.save_model(tmp_file) -# with open(tmp_file, "rb") as f: -# data_buffer = io.BytesIO(f.read()) -# f_name = f"xgboost_{uuid4()}.json" -# save_context.zip_file.writestr(f_name, data_buffer.getbuffer()) -# res.update(type="xgboost-format", file=f_name) -# except sklearn.exceptions.NotFittedError: -# res.update(type="params", content=get_state(obj.get_params(), save_context)) - -# return res - - -# class XGBoostNode(Node): -# def __init__( -# self, -# state: dict[str, Any], -# load_context: LoadContext, -# trusted: bool | Sequence[str] = False, -# ) -> None: -# super().__init__(state, load_context, trusted) -# self.type = state["type"] -# self.trusted = self._get_trusted(trusted, []) - -# # list of constructors is 
hard-coded for higher security -# constructors = { -# "XGBClassifier": xgboost.XGBClassifier, -# "XGBRegressor": xgboost.XGBRegressor, -# "XGBRFClassifier": xgboost.XGBRFClassifier, -# "XGBRFRegressor": xgboost.XGBRFRegressor, -# "XGBRanker": xgboost.XGBRanker, -# } -# self.children = {"constructor": constructors[state["constructor"]]} -# if self.type == "xgboost-format": # fitted xgboost model -# self.children["content"] = io.BytesIO(load_context.src.read(state["file"])) -# else: -# self.children["content"] = get_tree(state["content"], load_context) - -# def _construct(self): -# cls = self.children["constructor"] -# instance = cls() - -# if self.type == "xgboost-format": # fitted -# # load_model works with bytearray, so no temp file necessary here -# instance.load_model(bytearray(self.children["content"].getvalue())) -# else: -# params = self.children["content"].construct() -# instance.set_params(**params) - -# return instance - - -GET_STATE_DISPATCH_FUNCTIONS = [] # type: ignore -NODE_TYPE_MAPPING = {} # type: ignore - -# if xgboost is not None: -# GET_STATE_DISPATCH_FUNCTIONS.append((xgboost.sklearn.XGBModel, xgboost_get_state)) -# NODE_TYPE_MAPPING["XGBoostNode"] = XGBoostNode diff --git a/skops/io/tests/test_persist.py b/skops/io/tests/test_persist.py index 6edee731..fddd6f50 100644 --- a/skops/io/tests/test_persist.py +++ b/skops/io/tests/test_persist.py @@ -114,7 +114,7 @@ def wrapper(state, load_context, trusted): return wrapper - modules = ["._general", "._numpy", "._scipy", "._sklearn", "._xgboost"] + modules = ["._general", "._numpy", "._scipy", "._sklearn"] for module_name in modules: # overwrite exposed functions for get_state and get_tree module = importlib.import_module(module_name, package="skops.io") @@ -876,3 +876,16 @@ def test_estimator_with_bytes(): est = EstimatorWithBytes().fit(None, None) loaded = loads(dumps(est), trusted=True) assert_params_equal(est.__dict__, loaded.__dict__) + + +def test_estimator_with_bytes_files_created(tmp_path): + 
est = EstimatorWithBytes().fit(None, None) + f_name = tmp_path / "estimator.skops" + dump(est, f_name) + file = Path(f_name) + assert file.exists() + + with ZipFile(f_name, "r") as input_zip: + files = input_zip.namelist() + bin_files = [file for file in files if file.endswith(".bin")] + assert len(bin_files) == 2 From 3910a5429686ec4de3627280616026748c8fdb5e Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Tue, 13 Dec 2022 12:57:04 +0100 Subject: [PATCH 09/12] Add CatBoost to complete the holy trinity --- docs/changes.rst | 2 +- docs/persistence.rst | 4 +- skops/_min_dependencies.py | 1 + skops/io/tests/test_external.py | 73 +++++++++++++++++++++++++++++++++ 4 files changed, 78 insertions(+), 2 deletions(-) diff --git a/docs/changes.rst b/docs/changes.rst index d1d01360..b47da03f 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -17,7 +17,7 @@ v0.4 - All `scikit-learn` estimators are trusted by default. :pr:`237` by :user:`Edoardo Abati `. - Persistence now supports bytes and bytearrays, added tests to verify that - LightGBM and XGBoost work. :pr:`244` by `Benjamin Bossan`_. + LightGBM, XGBoost, and CatBoost work now. :pr:`244` by `Benjamin Bossan`_. v0.3 ---- diff --git a/docs/persistence.rst b/docs/persistence.rst index 7bc65516..19959f85 100644 --- a/docs/persistence.rst +++ b/docs/persistence.rst @@ -98,7 +98,9 @@ arrays, dtypes, random generators, and ufuncs. Apart from this core, we plan to support machine learning libraries commonly used be the community. So far, those are: -- `LightGBM `_ (scikit-learn API) +- `LightGBM `_ (scikit-learn API) +- `XGBoost `_ (scikit-learn API) +- `CatBoost `_ If you run into a problem using any of the mentioned libraries, this could mean there is a bug in skops. 
Please open an issue on `our issue tracker diff --git a/skops/_min_dependencies.py b/skops/_min_dependencies.py index 2065cb14..a60579c4 100644 --- a/skops/_min_dependencies.py +++ b/skops/_min_dependencies.py @@ -30,6 +30,7 @@ # required for persistence tests of external libraries "lightgbm": ("3", "tests", None), "xgboost": ("1.6", "tests", None), + "catboost": ("1.0", "tests", None), } diff --git a/skops/io/tests/test_external.py b/skops/io/tests/test_external.py index 4bfc0228..50c6d9b9 100644 --- a/skops/io/tests/test_external.py +++ b/skops/io/tests/test_external.py @@ -250,3 +250,76 @@ def test_ranker(self, xgboost, rank_data, trusted, booster, tree_method): estimator.fit(X, y, group=group) check_estimator(estimator, trusted=trusted) check_method_outputs(estimator, X, trusted=trusted) + + +class TestCatboost: + """Tests for CatBoostClassifier, CatBoostRegressor, and CatBoostRanker""" + + # CatBoost data is a little different so that it works as categorical data + @pytest.fixture(scope="module") + def cb_clf_data(self, clf_data): + X, y = clf_data + X = (X - X.min()).astype(int) + return X, y + + @pytest.fixture(scope="module") + def cb_regr_data(self, regr_data): + X, y = regr_data + X = (X - X.min()).astype(int) + return X, y + + @pytest.fixture(scope="module") + def cb_rank_data(self, rank_data): + X, y, group = rank_data + X = (X - X.min()).astype(int) + group_id = sum([[i] * n for i, n in enumerate(group)], []) + return X, y, group_id + + @pytest.fixture(autouse=True) + def catboost(self): + catboost = pytest.importorskip("catboost") + return catboost + + @pytest.fixture + def trusted(self): + # TODO: adjust once more types are trusted by default + return [ + "builtins.bytes", + "numpy.float32", + "numpy.float64", + "catboost.core.CatBoostClassifier", + "catboost.core.CatBoostRegressor", + "catboost.core.CatBoostRanker", + ] + + boosting_types = ["Ordered", "Plain"] + + @pytest.mark.parametrize("boosting_type", boosting_types) + def test_classifier(self, 
catboost, cb_clf_data, trusted, boosting_type): + estimator = catboost.CatBoostClassifier(boosting_type=boosting_type) + check_estimator(estimator, trusted=trusted) + + X, y = cb_clf_data + estimator.fit(X, y, cat_features=[0, 1]) + check_estimator(estimator, trusted=trusted) + check_method_outputs(estimator, X, trusted=trusted) + + @pytest.mark.parametrize("boosting_type", boosting_types) + def test_regressor(self, catboost, cb_regr_data, trusted, boosting_type): + estimator = catboost.CatBoostRegressor(boosting_type=boosting_type) + check_estimator(estimator, trusted=trusted) + + X, y = cb_regr_data + estimator.fit(X, y, cat_features=[0, 1]) + check_estimator(estimator, trusted=trusted) + check_method_outputs(estimator, X, trusted=trusted) + + @pytest.mark.parametrize("boosting_type", boosting_types) + def test_ranker(self, catboost, cb_rank_data, trusted, boosting_type): + estimator = catboost.CatBoostRanker(boosting_type=boosting_type) + check_estimator(estimator, trusted=trusted) + + X, y, group_id = cb_rank_data + estimator.fit(X, y, cat_features=[0, 1], group_id=group_id) + check_estimator(estimator, trusted=trusted) + check_method_outputs(estimator, X, trusted=trusted) From 4c73b80fc87a539949fc991807813a0c15f451ff Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Wed, 14 Dec 2022 11:19:23 +0100 Subject: [PATCH 10/12] Address reviewer comments - Refactor method output testing to be utility function - Add bytes and bytearray to trusted types --- skops/io/_general.py | 11 +++- skops/io/tests/_utils.py | 25 +++++++ skops/io/tests/test_external.py | 111 ++++++++++++++------------------ skops/io/tests/test_persist.py | 30 +-------- 4 files changed, 88 insertions(+), 89 deletions(-) diff --git a/skops/io/_general.py b/skops/io/_general.py index 77bd99cf..10ef9a0f 100644 --- a/skops/io/_general.py +++ b/skops/io/_general.py @@ -503,7 +503,7 @@ def __init__( trusted: bool | Sequence[str] = False, ) -> None: super().__init__(state, load_context, trusted) - 
self.trusted = self._get_trusted(trusted, []) + self.trusted = self._get_trusted(trusted, [bytes]) self.children = {"content": io.BytesIO(load_context.src.read(state["file"]))} def _construct(self): @@ -512,6 +512,15 @@ def _construct(self): class BytearrayNode(BytesNode): + def __init__( + self, + state: dict[str, Any], + load_context: LoadContext, + trusted: bool | Sequence[str] = False, + ) -> None: + super().__init__(state, load_context, trusted) + self.trusted = self._get_trusted(trusted, [bytearray]) + def _construct(self): content_bytes = super()._construct() content_bytearray = bytearray(list(content_bytes)) diff --git a/skops/io/tests/_utils.py b/skops/io/tests/_utils.py index fb870122..ead14b29 100644 --- a/skops/io/tests/_utils.py +++ b/skops/io/tests/_utils.py @@ -1,8 +1,14 @@ +import sys import warnings import numpy as np from scipy import sparse from sklearn.base import BaseEstimator +from sklearn.utils._testing import assert_allclose_dense_sparse + +# TODO: Investigate why that seems to be an issue on MacOS (only observed with +# Python 3.8) +ATOL = 1e-6 if sys.platform == "darwin" else 1e-7 def _is_steps_like(obj): @@ -143,3 +149,22 @@ def assert_params_equal(params1, params2): assert_params_equal(val1, val2) else: _assert_vals_equal(val1, val2) + + +def assert_method_outputs_equal(estimator, loaded, X): + # helper function that checks the output of all supported methods + for method in [ + "predict", + "predict_proba", + "decision_function", + "transform", + "predict_log_proba", + ]: + err_msg = ( + f"{estimator.__class__.__name__}.{method}() doesn't produce the same" + " results after loading the persisted model." 
+ ) + if hasattr(estimator, method): + X_out1 = getattr(estimator, method)(X) + X_out2 = getattr(loaded, method)(X) + assert_allclose_dense_sparse(X_out1, X_out2, err_msg=err_msg, atol=ATOL) diff --git a/skops/io/tests/test_external.py b/skops/io/tests/test_external.py index 50c6d9b9..6e9d5df8 100644 --- a/skops/io/tests/test_external.py +++ b/skops/io/tests/test_external.py @@ -1,13 +1,15 @@ -import sys +"""Test persistence of "external" packages + +Packages that are not builtins, standard lib, numpy, scipy, or scikit-learn. + +""" import pytest from sklearn.datasets import make_classification, make_regression -from sklearn.utils._testing import assert_allclose_dense_sparse from skops.io import dumps, loads -from skops.io.tests._utils import assert_params_equal +from skops.io.tests._utils import assert_method_outputs_equal, assert_params_equal -ATOL = 1e-6 if sys.platform == "darwin" else 1e-7 # Default settings for generated data N_SAMPLES = 30 N_FEATURES = 10 @@ -44,30 +46,6 @@ def rank_data(clf_data): return X, y, group -def check_estimator(estimator, trusted): - loaded = loads(dumps(estimator), trusted=trusted) - assert_params_equal(estimator.get_params(), loaded.get_params()) - - -def check_method_outputs(estimator, X, trusted): - loaded = loads(dumps(estimator), trusted=trusted) - for method in [ - "predict", - "predict_proba", - "decision_function", - "transform", - "predict_log_proba", - ]: - err_msg = ( - f"{estimator.__class__.__name__}.{method}() doesn't produce the same" - " results after loading the persisted model." 
- ) - if hasattr(estimator, method): - X_out1 = getattr(estimator, method)(X) - X_out2 = getattr(loaded, method)(X) - assert_allclose_dense_sparse(X_out1, X_out2, err_msg=err_msg, atol=ATOL) - - class TestLightGBM: """Tests for LGBMClassifier, LGBMRegressor, LGBMRanker""" @@ -100,12 +78,13 @@ def test_classifier(self, lgbm, clf_data, trusted, boosting_type): kw["bagging_freq"] = 2 estimator = lgbm.LGBMClassifier(boosting_type=boosting_type, **kw) - check_estimator(estimator, trusted=trusted) + loaded = loads(dumps(estimator), trusted=trusted) + assert_params_equal(estimator.get_params(), loaded.get_params()) X, y = clf_data estimator.fit(X, y) - check_estimator(estimator, trusted=trusted) - check_method_outputs(estimator, X, trusted=trusted) + loaded = loads(dumps(estimator), trusted=trusted) + assert_method_outputs_equal(estimator, loaded, X) @pytest.mark.parametrize("boosting_type", boosting_types) def test_regressor(self, lgbm, regr_data, trusted, boosting_type): @@ -115,12 +94,13 @@ def test_regressor(self, lgbm, regr_data, trusted, boosting_type): kw["bagging_freq"] = 2 estimator = lgbm.LGBMRegressor(boosting_type=boosting_type, **kw) - check_estimator(estimator, trusted=trusted) + loaded = loads(dumps(estimator), trusted=trusted) + assert_params_equal(estimator.get_params(), loaded.get_params()) X, y = regr_data estimator.fit(X, y) - check_estimator(estimator, trusted=trusted) - check_method_outputs(estimator, X, trusted=trusted) + loaded = loads(dumps(estimator), trusted=trusted) + assert_method_outputs_equal(estimator, loaded, X) @pytest.mark.parametrize("boosting_type", boosting_types) def test_ranker(self, lgbm, rank_data, trusted, boosting_type): @@ -130,12 +110,13 @@ def test_ranker(self, lgbm, rank_data, trusted, boosting_type): kw["bagging_freq"] = 2 estimator = lgbm.LGBMRanker(boosting_type=boosting_type, **kw) - check_estimator(estimator, trusted=trusted) + loaded = loads(dumps(estimator), trusted=trusted) + 
assert_params_equal(estimator.get_params(), loaded.get_params()) X, y, group = rank_data estimator.fit(X, y, group=group) - check_estimator(estimator, trusted=trusted) - check_method_outputs(estimator, X, trusted=trusted) + loaded = loads(dumps(estimator), trusted=trusted) + assert_method_outputs_equal(estimator, loaded, X) class TestXGBoost: @@ -184,12 +165,13 @@ def test_classifier(self, xgboost, clf_data, trusted, booster, tree_method): return estimator = xgboost.XGBClassifier(booster=booster, tree_method=tree_method) - check_estimator(estimator, trusted=trusted) + loaded = loads(dumps(estimator), trusted=trusted) + assert_params_equal(estimator.get_params(), loaded.get_params()) X, y = clf_data estimator.fit(X, y) - check_estimator(estimator, trusted=trusted) - check_method_outputs(estimator, X, trusted=trusted) + loaded = loads(dumps(estimator), trusted=trusted) + assert_method_outputs_equal(estimator, loaded, X) @pytest.mark.parametrize("booster", boosters) @pytest.mark.parametrize("tree_method", tree_methods) @@ -199,12 +181,13 @@ def test_regressor(self, xgboost, regr_data, trusted, booster, tree_method): return estimator = xgboost.XGBRegressor(booster=booster, tree_method=tree_method) - check_estimator(estimator, trusted=trusted) + loaded = loads(dumps(estimator), trusted=trusted) + assert_params_equal(estimator.get_params(), loaded.get_params()) X, y = regr_data estimator.fit(X, y) - check_estimator(estimator, trusted=trusted) - check_method_outputs(estimator, X, trusted=trusted) + loaded = loads(dumps(estimator), trusted=trusted) + assert_method_outputs_equal(estimator, loaded, X) @pytest.mark.parametrize("booster", boosters) @pytest.mark.parametrize("tree_method", tree_methods) @@ -214,12 +197,13 @@ def test_rf_classifier(self, xgboost, clf_data, trusted, booster, tree_method): return estimator = xgboost.XGBRFClassifier(booster=booster, tree_method=tree_method) - check_estimator(estimator, trusted=trusted) + loaded = loads(dumps(estimator), 
trusted=trusted) + assert_params_equal(estimator.get_params(), loaded.get_params()) X, y = clf_data estimator.fit(X, y) - check_estimator(estimator, trusted=trusted) - check_method_outputs(estimator, X, trusted=trusted) + loaded = loads(dumps(estimator), trusted=trusted) + assert_method_outputs_equal(estimator, loaded, X) @pytest.mark.parametrize("booster", boosters) @pytest.mark.parametrize("tree_method", tree_methods) @@ -229,12 +213,13 @@ def test_rf_regressor(self, xgboost, regr_data, trusted, booster, tree_method): return estimator = xgboost.XGBRFRegressor(booster=booster, tree_method=tree_method) - check_estimator(estimator, trusted=trusted) + loaded = loads(dumps(estimator), trusted=trusted) + assert_params_equal(estimator.get_params(), loaded.get_params()) X, y = regr_data estimator.fit(X, y) - check_estimator(estimator, trusted=trusted) - check_method_outputs(estimator, X, trusted=trusted) + loaded = loads(dumps(estimator), trusted=trusted) + assert_method_outputs_equal(estimator, loaded, X) @pytest.mark.parametrize("booster", boosters) @pytest.mark.parametrize("tree_method", tree_methods) @@ -244,12 +229,13 @@ def test_ranker(self, xgboost, rank_data, trusted, booster, tree_method): return estimator = xgboost.XGBRanker(booster=booster, tree_method=tree_method) - check_estimator(estimator, trusted=trusted) + loaded = loads(dumps(estimator), trusted=trusted) + assert_params_equal(estimator.get_params(), loaded.get_params()) X, y, group = rank_data estimator.fit(X, y, group=group) - check_estimator(estimator, trusted=trusted) - check_method_outputs(estimator, X, trusted=trusted) + loaded = loads(dumps(estimator), trusted=trusted) + assert_method_outputs_equal(estimator, loaded, X) class TestCatboost: @@ -297,29 +283,32 @@ def trusted(self): @pytest.mark.parametrize("boosting_type", boosting_types) def test_classifier(self, catboost, cb_clf_data, trusted, boosting_type): estimator = catboost.CatBoostClassifier(boosting_type=boosting_type) - 
check_estimator(estimator, trusted=trusted) + loaded = loads(dumps(estimator), trusted=trusted) + assert_params_equal(estimator.get_params(), loaded.get_params()) X, y = cb_clf_data estimator.fit(X, y, cat_features=[0, 1]) - check_estimator(estimator, trusted=trusted) - check_method_outputs(estimator, X, trusted=trusted) + loaded = loads(dumps(estimator), trusted=trusted) + assert_method_outputs_equal(estimator, loaded, X) @pytest.mark.parametrize("boosting_type", boosting_types) def test_regressor(self, catboost, cb_regr_data, trusted, boosting_type): estimator = catboost.CatBoostRegressor(boosting_type=boosting_type) - check_estimator(estimator, trusted=trusted) + loaded = loads(dumps(estimator), trusted=trusted) + assert_params_equal(estimator.get_params(), loaded.get_params()) X, y = cb_regr_data estimator.fit(X, y, cat_features=[0, 1]) - check_estimator(estimator, trusted=trusted) - check_method_outputs(estimator, X, trusted=trusted) + loaded = loads(dumps(estimator), trusted=trusted) + assert_method_outputs_equal(estimator, loaded, X) @pytest.mark.parametrize("boosting_type", boosting_types) def test_ranker(self, catboost, cb_rank_data, trusted, boosting_type): estimator = catboost.CatBoostRanker(boosting_type=boosting_type) - check_estimator(estimator, trusted=trusted) + loaded = loads(dumps(estimator), trusted=trusted) + assert_params_equal(estimator.get_params(), loaded.get_params()) X, y, group_id = cb_rank_data estimator.fit(X, y, cat_features=[0, 1], group_id=group_id) - check_estimator(estimator, trusted=trusted) - check_method_outputs(estimator, X, trusted=trusted) + loaded = loads(dumps(estimator), trusted=trusted) + assert_method_outputs_equal(estimator, loaded, X) diff --git a/skops/io/tests/test_persist.py b/skops/io/tests/test_persist.py index fddd6f50..51504562 100644 --- a/skops/io/tests/test_persist.py +++ b/skops/io/tests/test_persist.py @@ -2,7 +2,6 @@ import inspect import io import json -import sys import warnings from collections import 
Counter from functools import partial, wraps @@ -41,11 +40,7 @@ ) from sklearn.utils import all_estimators, check_random_state from sklearn.utils._tags import _safe_tags -from sklearn.utils._testing import ( - SkipTest, - assert_allclose_dense_sparse, - set_random_state, -) +from sklearn.utils._testing import SkipTest, set_random_state from sklearn.utils.estimator_checks import ( _construct_instance, _enforce_estimator_tags_y, @@ -59,16 +54,12 @@ from skops.io._trusted_types import SKLEARN_ESTIMATOR_TYPE_NAMES from skops.io._utils import LoadContext, SaveContext, _get_state, get_state from skops.io.exceptions import UnsupportedTypeException -from skops.io.tests._utils import assert_params_equal +from skops.io.tests._utils import assert_method_outputs_equal, assert_params_equal # Default settings for X N_SAMPLES = 50 N_FEATURES = 20 -# TODO: Investigate why that seems to be an issue on MacOS (only observed with -# Python 3.8) -ATOL = 1e-6 if sys.platform == "darwin" else 1e-7 - @pytest.fixture(autouse=True, scope="module") def debug_dispatch_functions(): @@ -354,22 +345,7 @@ def test_can_persist_fitted(estimator): assert_params_equal(estimator.__dict__, loaded.__dict__) assert not any(type_ in SKLEARN_ESTIMATOR_TYPE_NAMES for type_ in untrusted_types) - - for method in [ - "predict", - "predict_proba", - "decision_function", - "transform", - "predict_log_proba", - ]: - err_msg = ( - f"{estimator.__class__.__name__}.{method}() doesn't produce the same" - " results after loading the persisted model." 
- ) - if hasattr(estimator, method): - X_pred1 = getattr(estimator, method)(X) - X_pred2 = getattr(loaded, method)(X) - assert_allclose_dense_sparse(X_pred1, X_pred2, err_msg=err_msg, atol=ATOL) + assert_method_outputs_equal(estimator, loaded) @pytest.mark.parametrize( From 32adb818eaf97401ee7623116f6b937c19ad7aa1 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Wed, 14 Dec 2022 11:49:40 +0100 Subject: [PATCH 11/12] Fix missing argument in test function --- skops/io/tests/test_persist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skops/io/tests/test_persist.py b/skops/io/tests/test_persist.py index 51504562..bd9c1e20 100644 --- a/skops/io/tests/test_persist.py +++ b/skops/io/tests/test_persist.py @@ -345,7 +345,7 @@ def test_can_persist_fitted(estimator): assert_params_equal(estimator.__dict__, loaded.__dict__) assert not any(type_ in SKLEARN_ESTIMATOR_TYPE_NAMES for type_ in untrusted_types) - assert_method_outputs_equal(estimator, loaded) + assert_method_outputs_equal(estimator, loaded, X) @pytest.mark.parametrize( From 62fb8f2440a117a99c18a3fdc7008bc8ba48d6b0 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Wed, 14 Dec 2022 12:59:51 +0100 Subject: [PATCH 12/12] Reviewer comment: improve comment Co-authored-by: Adrin Jalali --- skops/io/tests/test_external.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skops/io/tests/test_external.py b/skops/io/tests/test_external.py index 6e9d5df8..fc58f5f4 100644 --- a/skops/io/tests/test_external.py +++ b/skops/io/tests/test_external.py @@ -132,7 +132,7 @@ class TestXGBoost: These discrepancies occur regardless of skops, so they're a problem in xgboost itself. We assume that this has no practical consequences and thus - avoid testing these cases. + avoid testing these cases. See https://github.com/dmlc/xgboost/issues/8596 """