From 2182e1c239c28af1dc4cb33e34154318f07fc71d Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Tue, 18 Apr 2023 16:54:45 +0200
Subject: [PATCH 1/3] ENH expose compression and compression level to the user
 API

---
 skops/io/_persist.py           | 58 +++++++++++++++++++++++-----------
 skops/io/tests/test_persist.py | 35 +++++++++++++++++++-
 2 files changed, 73 insertions(+), 20 deletions(-)

diff --git a/skops/io/_persist.py b/skops/io/_persist.py
index 507b61d9..8d81f11a 100644
--- a/skops/io/_persist.py
+++ b/skops/io/_persist.py
@@ -5,7 +5,7 @@
 import json
 from pathlib import Path
 from typing import Any, BinaryIO, Sequence
-from zipfile import ZipFile
+from zipfile import ZIP_STORED, ZipFile
 
 import skops
 
@@ -26,10 +26,12 @@
     NODE_TYPE_MAPPING.update(module.NODE_TYPE_MAPPING)
 
 
-def _save(obj: Any) -> io.BytesIO:
+def _save(obj: Any, compression: int, compresslevel: int | None) -> io.BytesIO:
     buffer = io.BytesIO()
 
-    with ZipFile(buffer, "w") as zip_file:
+    with ZipFile(
+        buffer, "w", compression=compression, compresslevel=compresslevel
+    ) as zip_file:
         save_context = SaveContext(zip_file=zip_file)
         state = get_state(obj, save_context)
         save_context.clear_memo()
@@ -41,19 +43,19 @@ def _save(obj: Any) -> io.BytesIO:
     return buffer
 
 
-def dump(obj: Any, file: str | Path | BinaryIO) -> None:
+def dump(
+    obj: Any,
+    file: str | Path | BinaryIO,
+    *,
+    compression: int = ZIP_STORED,
+    compresslevel: int | None = None,
+) -> None:
     """Save an object using the skops persistence format.
 
     Skops aims at providing a secure persistence feature that does not rely on
     :mod:`pickle`, which is inherently insecure. For more information, please
     visit the :ref:`persistence` documentation.
 
-    .. warning::
-
-        This feature is heavily under development, which means the API is
-        unstable and there might be security issues at the moment. Therefore,
-        use caution when loading files from sources you don't trust.
-
     Parameters
     ----------
     obj: object
@@ -64,8 +66,19 @@ def dump(obj: Any, file: str | Path | BinaryIO) -> None:
         convention, we recommend to use the ".skops" file extension, e.g.
         ``save(model, "my-model.skops")``.
 
+    compression: int, default=zipfile.ZIP_STORED
+        The compression method to use. See :class:`zipfile.ZipFile` for more
+        information.
+
+        .. versionadded:: 0.7
+
+    compresslevel: int or None, default=None
+        The compression level to use. See :class:`zipfile.ZipFile` for more
+        information.
+
+        .. versionadded:: 0.7
     """
-    buffer = _save(obj)
+    buffer = _save(obj, compression=compression, compresslevel=compresslevel)
 
     if isinstance(file, (str, Path)):
         with open(file, "wb") as f:
@@ -74,22 +87,29 @@ def dump(obj: Any, file: str | Path | BinaryIO) -> None:
         file.write(buffer.getbuffer())
 
 
-def dumps(obj: Any) -> bytes:
+def dumps(
+    obj: Any, *, compression: int = ZIP_STORED, compresslevel: int | None = None
+) -> bytes:
     """Save an object using the skops persistence format as a bytes object.
 
-    .. warning::
-
-        This feature is heavily under development, which means the API is
-        unstable and there might be security issues at the moment. Therefore,
-        use caution when loading files from sources you don't trust.
-
     Parameters
     ----------
     obj: object
         The object to be saved. Usually a scikit-learn compatible model.
 
+    compression: int, default=zipfile.ZIP_STORED
+        The compression method to use. See :class:`zipfile.ZipFile` for more
+        information.
+
+        .. versionadded:: 0.7
+
+    compresslevel: int or None, default=None
+        The compression level to use. See :class:`zipfile.ZipFile` for more
+        information.
+
+        .. versionadded:: 0.7
     """
-    buffer = _save(obj)
+    buffer = _save(obj, compression=compression, compresslevel=compresslevel)
     return buffer.getbuffer().tobytes()
 
 
diff --git a/skops/io/tests/test_persist.py b/skops/io/tests/test_persist.py
index 9876c590..a4a9250e 100644
--- a/skops/io/tests/test_persist.py
+++ b/skops/io/tests/test_persist.py
@@ -8,7 +8,7 @@
 from collections import Counter
 from functools import partial, wraps
 from pathlib import Path
-from zipfile import ZipFile
+from zipfile import ZIP_DEFLATED, ZipFile
 
 import joblib
 import numpy as np
@@ -20,6 +20,7 @@
 from sklearn.decomposition import SparseCoder
 from sklearn.exceptions import SkipTestWarning
 from sklearn.experimental import enable_halving_search_cv  # noqa
+from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import (
     GridSearchCV,
@@ -1002,3 +1003,35 @@ def test_persist_function(func):
     # check that loaded estimator is identical
     assert_params_equal(estimator.__dict__, loaded.__dict__)
     assert_method_outputs_equal(estimator, loaded, X)
+
+
+def test_compression_level():
+    # Test that setting the compression to zlib and specifying a
+    # compressionlevel reduces the dumped size.
+    text = """
+    Stop words are words like “and”, “the”, “him”, which are presumed to be
+    uninformative in representing the content of a text, and which may be
+    removed to avoid them being construed as signal for prediction. Sometimes,
+    however, similar words are useful for prediction, such as in classifying
+    writing style or personality.
+
+    There are several known issues in our provided ‘english’ stop word list. It
+    does not aim to be a general, ‘one-size-fits-all’ solution as some tasks
+    may require a more custom solution. See [NQY18] for more details.
+
+    Please take care in choosing a stop word list. Popular stop word lists may
+    include words that are highly informative to some tasks, such as computer.
+
+    You should also make sure that the stop word list has had the same
+    preprocessing and tokenization applied as the one used in the vectorizer.
+    The word we’ve is split into we and ve by CountVectorizer’s default
+    tokenizer, so if we’ve is in stop_words, but ve is not, ve will be retained
+    from we’ve in transformed text. Our vectorizers will try to identify and
+    warn about some kinds of inconsistencies.
+    """
+
+    model = TfidfVectorizer().fit([text])
+    dumped_raw = dumps(model)
+    dumped_compressed = dumps(model, compression=ZIP_DEFLATED, compresslevel=9)
+    # This reduces the size substantially: 63465 -> 3917
+    assert len(dumped_raw) > len(dumped_compressed)

From ea61cf0d1dcfd8b43304b559e7496b7ba4dee693 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Tue, 18 Apr 2023 16:59:05 +0200
Subject: [PATCH 2/3] add changelog

---
 docs/changes.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/changes.rst b/docs/changes.rst
index 11bb032b..295b876b 100644
--- a/docs/changes.rst
+++ b/docs/changes.rst
@@ -12,7 +12,9 @@ skops Changelog
 
 v0.7
 ----
-
+- ``compression`` and ``compresslevel`` from :class:`~zipfile.ZipFile` are now
+  exposed to the user via :func:`.io.dumps` and :func:`.io.dump`. :pr:`345` by
+  `Adrin Jalali`_.
 
 v0.6
 ----

From 954f9aebfca4ffcc69bc0ee9a336c390dcec5a1a Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Thu, 20 Apr 2023 12:18:45 +0200
Subject: [PATCH 3/3] apply suggestions

---
 docs/persistence.rst           | 16 ++++++++++++++++
 skops/io/tests/test_persist.py | 28 +++-------------------------
 2 files changed, 19 insertions(+), 25 deletions(-)

diff --git a/docs/persistence.rst b/docs/persistence.rst
index 2ed2819e..4d7e9e84 100644
--- a/docs/persistence.rst
+++ b/docs/persistence.rst
@@ -110,6 +110,22 @@ you have custom functions (say, a custom function to be used with
 most ``numpy`` and ``scipy`` functions should work. Therefore, you can save
 objects having references to functions such as ``numpy.sqrt``.
 
+Compression
+~~~~~~~~~~~
+
+If file size is an issue, you can compress the file by setting the
+``compression`` and ``compresslevel`` arguments to :func:`skops.io.dump` and
+:func:`skops.io.dumps`. For example, to compress the file using ``zlib`` with
+level 9:
+
+.. code:: python
+
+    from zipfile import ZIP_DEFLATED
+    dump(clf, "my-model.skops", compression=ZIP_DEFLATED, compresslevel=9)
+
+Check the documentation of these two arguments under :class:`zipfile.ZipFile`
+for more details.
+
 Command Line Interface
 ######################
 
diff --git a/skops/io/tests/test_persist.py b/skops/io/tests/test_persist.py
index a4a9250e..e501abc1 100644
--- a/skops/io/tests/test_persist.py
+++ b/skops/io/tests/test_persist.py
@@ -1008,30 +1008,8 @@ def test_persist_function(func):
 def test_compression_level():
     # Test that setting the compression to zlib and specifying a
     # compressionlevel reduces the dumped size.
-    text = """
-    Stop words are words like “and”, “the”, “him”, which are presumed to be
-    uninformative in representing the content of a text, and which may be
-    removed to avoid them being construed as signal for prediction. Sometimes,
-    however, similar words are useful for prediction, such as in classifying
-    writing style or personality.
-
-    There are several known issues in our provided ‘english’ stop word list. It
-    does not aim to be a general, ‘one-size-fits-all’ solution as some tasks
-    may require a more custom solution. See [NQY18] for more details.
-
-    Please take care in choosing a stop word list. Popular stop word lists may
-    include words that are highly informative to some tasks, such as computer.
-
-    You should also make sure that the stop word list has had the same
-    preprocessing and tokenization applied as the one used in the vectorizer.
-    The word we’ve is split into we and ve by CountVectorizer’s default
-    tokenizer, so if we’ve is in stop_words, but ve is not, ve will be retained
-    from we’ve in transformed text. Our vectorizers will try to identify and
-    warn about some kinds of inconsistencies.
-    """
-
-    model = TfidfVectorizer().fit([text])
+    model = TfidfVectorizer().fit([np.__doc__])
     dumped_raw = dumps(model)
     dumped_compressed = dumps(model, compression=ZIP_DEFLATED, compresslevel=9)
-    # This reduces the size substantially: 63465 -> 3917
-    assert len(dumped_raw) > len(dumped_compressed)
+    # This reduces the size substantially
+    assert len(dumped_raw) > 5 * len(dumped_compressed)