skops-dev · adrinjalali · Apr 25, 2023 · Apr 21, 2023 · Apr 21, 2023 · Apr 25, 2023
diff --git a/.github/workflows/persistence-performance.yml b/.github/workflows/persistence-performance.yml
@@ -1,4 +1,4 @@
-name: Test performance of skops persistence
+name: Test performance and file size of skops persistence
 
 on:
   schedule:
@@ -26,3 +26,5 @@ jobs:
         pip list
     - name: Run persistence performance checks
       run: python scripts/check_persistence_performance.py
+    - name: Run file size checks
+      run: python scripts/check_file_size.py
diff --git a/scripts/check_file_size.py b/scripts/check_file_size.py
@@ -0,0 +1,128 @@
+"""Check that the file size of skops files is not too large.
+
+Fit each estimator (where required) and persist it with pickle and with skops.
+Measure the file size of the resulting files. Report the results and raise an
+error if any skops file is larger than MAX_ALLOWED_SIZE (given in kb).
+
+Zip compression is applied. This is because we can assume that if a user really
+cares about file size, they will compress the file.
+
+"""
+
+from __future__ import annotations
+
+import io
+import os
+import pickle
+import warnings
+from tempfile import mkstemp
+from typing import Any
+from zipfile import ZIP_DEFLATED, ZipFile
+
+import pandas as pd
+from sklearn.utils._tags import _safe_tags
+from sklearn.utils._testing import set_random_state
+
+import skops.io as sio
+from skops.io.tests.test_persist import (
+    _get_check_estimator_ids,
+    _tested_estimators,
+    get_input,
+)
+
+TOPK = 10  # number of largest differences reported per table
+MAX_ALLOWED_SIZE = 1024  # maximum allowed skops file size in kb
+
+
+def check_file_size() -> None:
+    """Measure pickle and skops file sizes for all tested estimators.
+
+    Every estimator gets a fixed random state, is fitted if its tags require
+    it, and is passed to ``run_check``. The collected sizes are then reported
+    by ``format_result``, called with ``topk=TOPK``.
+
+    """
+    names: list[Any] = []
+    pickle_kb: list[Any] = []
+    skops_kb: list[Any] = []
+
+    for est in _tested_estimators():
+        set_random_state(est, random_state=0)
+        X, y = get_input(est)
+        if _safe_tags(est).get("requires_fit", True):
+            fit_args = (X,) if y is None else (X, y)
+            with warnings.catch_warnings():
+                # silence warnings raised from sklearn modules while fitting
+                warnings.filterwarnings("ignore", module="sklearn")
+                est.fit(*fit_args)
+
+        # keep only the text before the first "(" of the id (the class name)
+        names.append(_get_check_estimator_ids(est).partition("(")[0])
+        size_pickle, size_skops = run_check(est)
+        pickle_kb.append(size_pickle)
+        skops_kb.append(size_skops)
+
+    results = {"name": names, "pickle (kb)": pickle_kb, "skops (kb)": skops_kb}
+    format_result(results, topk=TOPK)
+
+
+def run_check(estimator) -> tuple[float, float]:
+    """Return the compressed pickle and skops file sizes of ``estimator`` in kb.
+
+    Both files are written next to a unique temporary path and removed again
+    before returning, so repeated calls do not fill up the temp directory.
+
+    """
+    fd, name = mkstemp(prefix="skops")
+    os.close(fd)  # only the unique path is needed, don't leak the descriptor
+    fname_pickle, fname_skops = name + ".pickle", name + ".skops"
+    fname_zip = fname_pickle + ".zip"
+    try:
+        buffer = io.BytesIO()
+        pickle.dump(estimator, buffer)
+        with ZipFile(
+            fname_zip, mode="w", compression=ZIP_DEFLATED, compresslevel=9
+        ) as zipf:
+            zipf.writestr(fname_pickle, buffer.getvalue())
+        sio.dump(estimator, fname_skops, compression=ZIP_DEFLATED, compresslevel=9)
+        # st_size is in bytes, the sizes are reported in kb
+        return os.stat(fname_zip).st_size / 1024, os.stat(fname_skops).st_size / 1024
+    finally:
+        for fname in (name, fname_zip, fname_skops):
+            if os.path.exists(fname):
+                os.remove(fname)
+
+
+def format_result(results: dict[str, list[Any]], topk: int) -> None:
+    """Report results from file size checks.
+
+    Print the ``topk`` largest file size differences between pickle and skops,
+    once for absolute, once for relative differences. Raise a ``RuntimeError``
+    if any skops file is larger than ``MAX_ALLOWED_SIZE`` kb.
+
+    """
+    df = pd.DataFrame(results)
+    df = df.assign(
+        abs_diff=df["skops (kb)"] - df["pickle (kb)"],
+        rel_diff=df["skops (kb)"] / df["pickle (kb)"],
+    )
+
+    dfs = df.sort_values(["abs_diff"], ascending=False).reset_index(drop=True)
+    print(f"{topk} largest absolute differences:")
+    print(dfs[["name", "pickle (kb)", "skops (kb)", "abs_diff"]].head(topk))
+
+    print(f"{topk} largest relative differences:")
+    dfs = df.sort_values(["rel_diff"], ascending=False).reset_index(drop=True)
+    print(dfs[["name", "pickle (kb)", "skops (kb)", "rel_diff"]].head(topk))
+
+    df_large = df[df["skops (kb)"] > MAX_ALLOWED_SIZE]
+    if df_large.empty:
+        print("No file was found to be unacceptably large.")
+        return
+    print(f"Found {len(df_large)} skops file(s) larger than {MAX_ALLOWED_SIZE} kb:")
+    print(", ".join(df_large["name"].tolist()))
+    raise RuntimeError("Found unacceptably large skops files.")
+
+
+if __name__ == "__main__":
+    check_file_size()  # an uncaught RuntimeError here fails the CI step