From 2182e1c239c28af1dc4cb33e34154318f07fc71d Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 18 Apr 2023 16:54:45 +0200 Subject: [PATCH 1/3] ENH expose compression and compression level to the user API --- skops/io/_persist.py | 58 +++++++++++++++++++++++----------- skops/io/tests/test_persist.py | 35 +++++++++++++++++++- 2 files changed, 73 insertions(+), 20 deletions(-) diff --git a/skops/io/_persist.py b/skops/io/_persist.py index 507b61d9..8d81f11a 100644 --- a/skops/io/_persist.py +++ b/skops/io/_persist.py @@ -5,7 +5,7 @@ import json from pathlib import Path from typing import Any, BinaryIO, Sequence -from zipfile import ZipFile +from zipfile import ZIP_STORED, ZipFile import skops @@ -26,10 +26,12 @@ NODE_TYPE_MAPPING.update(module.NODE_TYPE_MAPPING) -def _save(obj: Any) -> io.BytesIO: +def _save(obj: Any, compression: int, compresslevel: int | None) -> io.BytesIO: buffer = io.BytesIO() - with ZipFile(buffer, "w") as zip_file: + with ZipFile( + buffer, "w", compression=compression, compresslevel=compresslevel + ) as zip_file: save_context = SaveContext(zip_file=zip_file) state = get_state(obj, save_context) save_context.clear_memo() @@ -41,19 +43,19 @@ def _save(obj: Any) -> io.BytesIO: return buffer -def dump(obj: Any, file: str | Path | BinaryIO) -> None: +def dump( + obj: Any, + file: str | Path | BinaryIO, + *, + compression: int = ZIP_STORED, + compresslevel: int | None = None, +) -> None: """Save an object using the skops persistence format. Skops aims at providing a secure persistence feature that does not rely on :mod:`pickle`, which is inherently insecure. For more information, please visit the :ref:`persistence` documentation. - .. warning:: - - This feature is heavily under development, which means the API is - unstable and there might be security issues at the moment. Therefore, - use caution when loading files from sources you don't trust. - Parameters ---------- obj: object @@ -64,8 +66,19 @@ def dump(obj: Any, file: str | Path | BinaryIO) -> None: convention, we recommend to use the ".skops" file extension, e.g. ``save(model, "my-model.skops")``. + compression: int, default=zipfile.ZIP_STORED + The compression method to use. See :class:`zipfile.ZipFile` for more + information. + + .. versionadded:: 0.7 + + compresslevel: int, default=None + The compression level to use. See :class:`zipfile.ZipFile` for more + information. + + .. versionadded:: 0.7 """ - buffer = _save(obj) + buffer = _save(obj, compression=compression, compresslevel=compresslevel) if isinstance(file, (str, Path)): with open(file, "wb") as f: @@ -74,22 +87,29 @@ def dump(obj: Any, file: str | Path | BinaryIO) -> None: file.write(buffer.getbuffer()) -def dumps(obj: Any) -> bytes: +def dumps( + obj: Any, *, compression: int = ZIP_STORED, compresslevel: int | None = None +) -> bytes: """Save an object using the skops persistence format as a bytes object. - .. warning:: - - This feature is heavily under development, which means the API is - unstable and there might be security issues at the moment. Therefore, - use caution when loading files from sources you don't trust. - Parameters ---------- obj: object The object to be saved. Usually a scikit-learn compatible model. + compression: int, default=zipfile.ZIP_STORED + The compression method to use. See :class:`zipfile.ZipFile` for more + information. + + .. versionadded:: 0.7 + + compresslevel: int, default=None + The compression level to use. See :class:`zipfile.ZipFile` for more + information. + + .. versionadded:: 0.7 """ - buffer = _save(obj) + buffer = _save(obj, compression=compression, compresslevel=compresslevel) return buffer.getbuffer().tobytes() diff --git a/skops/io/tests/test_persist.py b/skops/io/tests/test_persist.py index 9876c590..a4a9250e 100644 --- a/skops/io/tests/test_persist.py +++ b/skops/io/tests/test_persist.py @@ -8,7 +8,7 @@ from collections import Counter from functools import partial, wraps from pathlib import Path -from zipfile import ZipFile +from zipfile import ZIP_DEFLATED, ZipFile import joblib import numpy as np @@ -20,6 +20,7 @@ from sklearn.decomposition import SparseCoder from sklearn.exceptions import SkipTestWarning from sklearn.experimental import enable_halving_search_cv # noqa +from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model import LogisticRegression from sklearn.model_selection import ( GridSearchCV, @@ -1002,3 +1003,35 @@ def test_persist_function(func): # check that loaded estimator is identical assert_params_equal(estimator.__dict__, loaded.__dict__) assert_method_outputs_equal(estimator, loaded, X) + + +def test_compression_level(): + # Test that setting the compression to zlib and specifying a + # compressionlevel reduces the dumped size. + text = """ + Stop words are words like “and”, “the”, “him”, which are presumed to be + uninformative in representing the content of a text, and which may be + removed to avoid them being construed as signal for prediction. Sometimes, + however, similar words are useful for prediction, such as in classifying + writing style or personality. + + There are several known issues in our provided ‘english’ stop word list. It + does not aim to be a general, ‘one-size-fits-all’ solution as some tasks + may require a more custom solution. See [NQY18] for more details. + + Please take care in choosing a stop word list. Popular stop word lists may + include words that are highly informative to some tasks, such as computer. + + You should also make sure that the stop word list has had the same + preprocessing and tokenization applied as the one used in the vectorizer. + The word we’ve is split into we and ve by CountVectorizer’s default + tokenizer, so if we’ve is in stop_words, but ve is not, ve will be retained + from we’ve in transformed text. Our vectorizers will try to identify and + warn about some kinds of inconsistencies. + """ + + model = TfidfVectorizer().fit([text]) + dumped_raw = dumps(model) + dumped_compressed = dumps(model, compression=ZIP_DEFLATED, compresslevel=9) + # This reduces the size substantially: 63465 -> 3917 + assert len(dumped_raw) > len(dumped_compressed) From ea61cf0d1dcfd8b43304b559e7496b7ba4dee693 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 18 Apr 2023 16:59:05 +0200 Subject: [PATCH 2/3] add changelog --- docs/changes.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/changes.rst b/docs/changes.rst index 11bb032b..295b876b 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -12,7 +12,9 @@ skops Changelog v0.7 ---- - +- `compression` and `compresslevel` from :class:`~zipfile.ZipFile` are now + exposed to the user via :func:`.io.dumps` and :func:`.io.dump`. :pr:`345` by + `Adrin Jalali`_. v0.6 ---- From 954f9aebfca4ffcc69bc0ee9a336c390dcec5a1a Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 20 Apr 2023 12:18:45 +0200 Subject: [PATCH 3/3] apply suggestions --- docs/persistence.rst | 16 ++++++++++++++++ skops/io/tests/test_persist.py | 28 +++------------------------- 2 files changed, 19 insertions(+), 25 deletions(-) diff --git a/docs/persistence.rst b/docs/persistence.rst index 2ed2819e..4d7e9e84 100644 --- a/docs/persistence.rst +++ b/docs/persistence.rst @@ -110,6 +110,22 @@ you have custom functions (say, a custom function to be used with most ``numpy`` and ``scipy`` functions should work. Therefore, you can save objects having references to functions such as ``numpy.sqrt``. +Compression +~~~~~~~~~~~ + +If file size is an issue, you can compress the file by setting the +``compression`` and ``compresslevel`` arguments to :func:`skops.io.dump` and +:func:`skops.io.dumps`. For example, to compress the file using ``zlib`` with +level 9: + +.. code:: python + + from zipfile import ZIP_DEFLATED + dump(clf, "my-model.skops", compression=ZIP_DEFLATED, compresslevel=9) + +Check the documentation of these two arguments under :class:`zipfile.ZipFile` +for more details. + Command Line Interface ###################### diff --git a/skops/io/tests/test_persist.py b/skops/io/tests/test_persist.py index a4a9250e..e501abc1 100644 --- a/skops/io/tests/test_persist.py +++ b/skops/io/tests/test_persist.py @@ -1008,30 +1008,8 @@ def test_persist_function(func): def test_compression_level(): # Test that setting the compression to zlib and specifying a # compressionlevel reduces the dumped size. - text = """ - Stop words are words like “and”, “the”, “him”, which are presumed to be - uninformative in representing the content of a text, and which may be - removed to avoid them being construed as signal for prediction. Sometimes, - however, similar words are useful for prediction, such as in classifying - writing style or personality. - - There are several known issues in our provided ‘english’ stop word list. It - does not aim to be a general, ‘one-size-fits-all’ solution as some tasks - may require a more custom solution. See [NQY18] for more details. - - Please take care in choosing a stop word list. Popular stop word lists may - include words that are highly informative to some tasks, such as computer. - - You should also make sure that the stop word list has had the same - preprocessing and tokenization applied as the one used in the vectorizer. - The word we’ve is split into we and ve by CountVectorizer’s default - tokenizer, so if we’ve is in stop_words, but ve is not, ve will be retained - from we’ve in transformed text. Our vectorizers will try to identify and - warn about some kinds of inconsistencies. - """ - - model = TfidfVectorizer().fit([text]) + model = TfidfVectorizer().fit([np.__doc__]) dumped_raw = dumps(model) dumped_compressed = dumps(model, compression=ZIP_DEFLATED, compresslevel=9) - # This reduces the size substantially: 63465 -> 3917 - assert len(dumped_raw) > len(dumped_compressed) + # This reduces the size substantially + assert len(dumped_raw) > 5 * len(dumped_compressed)