From cf1b7aa7af1e0abc98bc409d6a7b79150aa5149e Mon Sep 17 00:00:00 2001
From: Benjamin Bossan
Date: Fri, 21 Apr 2023 17:17:01 +0200
Subject: [PATCH 1/3] Run check for file size difference skops vs pickle

We already measure the runtime performance difference, but we don't check
the file size difference. This PR adds another, similar script that does
exactly that. The checks are run in the same workflow as the runtime
performance checks.

The results are reported, showing the top 10 largest differences, once in
terms of absolute and once in terms of relative differences. In contrast
to the runtime performance check, no error is ever raised, no matter how
big the difference is, because it is unclear what, if any, difference
would count as unacceptable.

For skops, the zip file is highly compressed. This is a reasonable choice
for the benchmark because we can assume that if file size is a concern,
users would choose that option.
---
 .github/workflows/persistence-performance.yml |   4 +-
 scripts/check_file_size.py                    | 112 ++++++++++++++++++
 2 files changed, 115 insertions(+), 1 deletion(-)
 create mode 100644 scripts/check_file_size.py

diff --git a/.github/workflows/persistence-performance.yml b/.github/workflows/persistence-performance.yml
index 0d82f6c9..be999636 100644
--- a/.github/workflows/persistence-performance.yml
+++ b/.github/workflows/persistence-performance.yml
@@ -1,4 +1,4 @@
-name: Test performance of skops persistence
+name: Test performance and file size of skops persistence
 
 on:
   schedule:
@@ -26,3 +26,5 @@ jobs:
         pip list
     - name: Run persistence performance checks
       run: python scripts/check_persistence_performance.py
+    - name: Run file size checks
+      run: python scripts/check_file_size.py

diff --git a/scripts/check_file_size.py b/scripts/check_file_size.py
new file mode 100644
index 00000000..946f2748
--- /dev/null
+++ b/scripts/check_file_size.py
@@ -0,0 +1,112 @@
+"""Check that the file size of skops files is not too large.
+
+Load each (fitted) estimator and persist it with pickle and with skops. Measure
+the file size of the resulting files. Report the results but, in contrast to the
+runtime check, don't raise any errors if the file size difference is too big.
+
+For skops, zip compression is applied. This is because we can assume that if a
+user really cares about file size, they will compress the file.
+
+"""
+
+from __future__ import annotations
+
+import os
+import pickle
+import warnings
+from tempfile import mkstemp
+from typing import Any
+from zipfile import ZIP_DEFLATED
+
+import pandas as pd
+from sklearn.utils._tags import _safe_tags
+from sklearn.utils._testing import set_random_state
+
+import skops.io as sio
+from skops.io.tests.test_persist import (
+    _get_check_estimator_ids,
+    _tested_estimators,
+    get_input,
+)
+
+TOPK = 10  # number of largest estimators reported
+
+
+def check_file_size() -> None:
+    """Run all file size checks on all estimators and report the results.
+
+    Print the results twice, once sorted by absolute differences, once sorted by
+    relative differences.
+
+    """
+    results: dict[str, list[Any]] = {"name": [], "pickle (kb)": [], "skops (kb)": []}
+    for estimator in _tested_estimators():
+        set_random_state(estimator, random_state=0)
+
+        X, y = get_input(estimator)
+        tags = _safe_tags(estimator)
+        if tags.get("requires_fit", True):
+            with warnings.catch_warnings():
+                warnings.filterwarnings("ignore", module="sklearn")
+                if y is not None:
+                    estimator.fit(X, y)
+                else:
+                    estimator.fit(X)
+
+        name = _get_check_estimator_ids(estimator)
+        cls_name, _, _ = name.partition("(")
+        size_pickle, size_skops = run_check(estimator)
+
+        results["name"].append(cls_name)
+        results["pickle (kb)"].append(size_pickle)
+        results["skops (kb)"].append(size_skops)
+
+    format_result(results, topk=TOPK)
+
+
+def run_check(estimator) -> tuple[float, float]:
+    """Run file size check with the given estimator for pickle and skops."""
+    _, name = mkstemp(prefix="skops")
+
+    def run_pickle():
+        fname = name + ".pickle"
+        with open(fname, "wb") as f:
+            pickle.dump(estimator, f)
+        # return size in kb
+        return os.stat(fname).st_size / 1024
+
+    def run_skops():
+        fname = name + ".skops"
+        sio.dump(estimator, fname, compression=ZIP_DEFLATED, compresslevel=9)
+        # return size in kb
+        return os.stat(fname).st_size / 1024
+
+    size_pickle = run_pickle()
+    size_skops = run_skops()
+    return size_pickle, size_skops
+
+
+def format_result(results: dict[str, list[Any]], topk: int) -> None:
+    """Report results from the file size checks.
+
+    Print the largest file size differences between pickle and skops, once for
+    absolute, once for relative differences.
+
+    """
+    df = pd.DataFrame(results)
+    df = df.assign(
+        abs_diff=df["skops (kb)"] - df["pickle (kb)"],
+        rel_diff=df["skops (kb)"] / df["pickle (kb)"],
+    )
+
+    dfs = df.sort_values(["abs_diff"], ascending=False).reset_index(drop=True)
+    print(f"{topk} largest absolute differences:")
+    print(dfs[["name", "pickle (kb)", "skops (kb)", "abs_diff"]].head(topk))
+
+    print(f"{topk} largest relative differences:")
+    dfs = df.sort_values(["rel_diff"], ascending=False).reset_index(drop=True)
+    print(dfs[["name", "pickle (kb)", "skops (kb)", "rel_diff"]].head(topk))
+
+
+if __name__ == "__main__":
+    check_file_size()

From 0ad4b23e9415fc2fbf4454218d88499d34c6e49f Mon Sep 17 00:00:00 2001
From: Benjamin Bossan
Date: Fri, 21 Apr 2023 17:36:18 +0200
Subject: [PATCH 2/3] For fairness, also compress the pickle files

---
 scripts/check_file_size.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/scripts/check_file_size.py b/scripts/check_file_size.py
index 946f2748..a1d91997 100644
--- a/scripts/check_file_size.py
+++ b/scripts/check_file_size.py
@@ -11,12 +11,13 @@
 
 from __future__ import annotations
 
+import io
 import os
 import pickle
 import warnings
 from tempfile import mkstemp
 from typing import Any
-from zipfile import ZIP_DEFLATED
+from zipfile import ZIP_DEFLATED, ZipFile
 
 import pandas as pd
 from sklearn.utils._tags import _safe_tags
@@ -70,10 +71,15 @@ def run_check(estimator) -> tuple[float, float]:
 
     def run_pickle():
         fname = name + ".pickle"
-        with open(fname, "wb") as f:
-            pickle.dump(estimator, f)
+        buffer = io.BytesIO()
+        pickle.dump(estimator, buffer)
+        with ZipFile(
+            fname + ".zip", mode="w", compression=ZIP_DEFLATED, compresslevel=9
+        ) as zipf:
+            zipf.writestr(fname, buffer.getvalue())
+
         # return size in kb
-        return os.stat(fname).st_size / 1024
+        return os.stat(fname + ".zip").st_size / 1024
 
     def run_skops():
         fname = name + ".skops"

From fa7fd1446ee1a9f50a1ef09943dee93c2dc42ff8 Mon Sep 17 00:00:00 2001
From: Benjamin Bossan
Date: Tue, 25 Apr 2023 11:11:12 +0200
Subject: [PATCH 3/3] Raise error if any skops file exceeds 1 MB

---
 scripts/check_file_size.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/scripts/check_file_size.py b/scripts/check_file_size.py
index a1d91997..c88bf11e 100644
--- a/scripts/check_file_size.py
+++ b/scripts/check_file_size.py
@@ -1,11 +1,11 @@
 """Check that the file size of skops files is not too large.
 
 Load each (fitted) estimator and persist it with pickle and with skops. Measure
-the file size of the resulting files. Report the results but, in contrast to the
-runtime check, don't raise any errors if the file size difference is too big.
+the file size of the resulting files. Report the results and raise an error if
+any file is larger than MAX_ALLOWED_SIZE.
 
-For skops, zip compression is applied. This is because we can assume that if a
-user really cares about file size, they will compress the file.
+Zip compression is applied. This is because we can assume that if a user really
+cares about file size, they will compress the file.
 
 """
 
@@ -31,6 +31,7 @@
 )
 
 TOPK = 10  # number of largest estimators reported
+MAX_ALLOWED_SIZE = 1024  # maximum allowed file size in kb
 
 
 def check_file_size() -> None:
@@ -113,6 +114,15 @@ def format_result(results: dict[str, list[Any]], topk: int) -> None:
     dfs = df.sort_values(["rel_diff"], ascending=False).reset_index(drop=True)
     print(dfs[["name", "pickle (kb)", "skops (kb)", "rel_diff"]].head(topk))
+
+    df_large = df[df["skops (kb)"] > MAX_ALLOWED_SIZE]
+    if df_large.empty:
+        print("No file was found to be unacceptably large.")
+        return
+
+    print(f"Found {len(df_large)} skops file(s) larger than {MAX_ALLOWED_SIZE} kb:")
+    print(", ".join(df_large["name"].tolist()))
+    raise RuntimeError("Found unacceptably large skops files.")
 
 
 if __name__ == "__main__":
     check_file_size()
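
For reference, below is a minimal, standalone sketch of the comparison these
patches implement, reduced to a single estimator. It assumes skops and
scikit-learn are installed; the LogisticRegression/iris pair, the file names,
and the temp directory handling are illustrative stand-ins for the estimators
and mkstemp paths used by the actual script, not part of the patches.

# Standalone sketch of the size comparison, assuming skops and
# scikit-learn are installed. LogisticRegression on iris is a
# hypothetical stand-in for the estimators from _tested_estimators().
import os
import pickle
from tempfile import mkdtemp
from zipfile import ZIP_DEFLATED, ZipFile

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

import skops.io as sio

X, y = load_iris(return_X_y=True)
estimator = LogisticRegression(max_iter=1000).fit(X, y)

tmpdir = mkdtemp(prefix="skops")
pickle_path = os.path.join(tmpdir, "model.pickle.zip")
skops_path = os.path.join(tmpdir, "model.skops")

# Compress the pickle payload too, mirroring patch 2, so that both
# formats are compared on equal footing.
with ZipFile(pickle_path, mode="w", compression=ZIP_DEFLATED, compresslevel=9) as zipf:
    zipf.writestr("model.pickle", pickle.dumps(estimator))

# skops writes a zip archive itself; pass the same compression settings.
sio.dump(estimator, skops_path, compression=ZIP_DEFLATED, compresslevel=9)

size_pickle = os.stat(pickle_path).st_size / 1024  # kb
size_skops = os.stat(skops_path).st_size / 1024  # kb
print(f"pickle (kb): {size_pickle:.1f}, skops (kb): {size_skops:.1f}")

Running this prints the two compressed sizes in kb, the same quantities that
feed the abs_diff and rel_diff columns in format_result.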