From d8dec8c4d264d708ec8e758be8fe7a42a135dda8 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Thu, 2 Feb 2023 15:45:32 +0100 Subject: [PATCH 1/3] Add performance test + GH action to run it weekly --- .github/workflows/persistence-performance.yml | 27 ++++++ scripts/check_persistence_performance.py | 94 +++++++++++++++++++ 2 files changed, 121 insertions(+) create mode 100644 .github/workflows/persistence-performance.yml create mode 100644 scripts/check_persistence_performance.py diff --git a/.github/workflows/persistence-performance.yml b/.github/workflows/persistence-performance.yml new file mode 100644 index 00000000..5c7e97d1 --- /dev/null +++ b/.github/workflows/persistence-performance.yml @@ -0,0 +1,27 @@ +name: HF integration tests + +on: + schedule: + - cron: '0 9 * * 0' # every sunday at 9:00 UTC + workflow_dispatch: + +jobs: + check-persistence-performance: + + runs-on: ubuntu-latest + # if: "github.repository == 'skops-dev/skops'" + if: "github.repository == 'BenjaminBossan/skops'" + + timeout-minutes: 10 + + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + - name: Install requirements + run: | + pip install .[tests] + pip --version + pip list + - name: Run persistence performance checks + run: python scripts/check_persistence_performance.py diff --git a/scripts/check_persistence_performance.py b/scripts/check_persistence_performance.py new file mode 100644 index 00000000..13300205 --- /dev/null +++ b/scripts/check_persistence_performance.py @@ -0,0 +1,94 @@ +"""Check that the performance of skops persistence is not too slow + +Load each (fitted) estimator and persist it with pickle and with skops. Measure +the time it takes and record it. Report the estimators that were slowest. If +skops is much slower than pickle (in absolute terms), raise an error to make the +GH action fail. 
+ +""" + +import pickle +import timeit +import warnings + +import pandas as pd +from sklearn.utils._tags import _safe_tags +from sklearn.utils._testing import set_random_state + +import skops.io as sio +from skops.io.tests.test_persist import ( + _get_check_estimator_ids, + _tested_estimators, + get_input, +) + +ATOL = 1 # seconds absolute difference allowed at max +NUM_REPS = 10 # number of times the check is repeated +TOPK = 10 # number of slowest estimators reported + + +def check_persist_performance(): + results = {"name": [], "pickle (s)": [], "skops (s)": []} + for estimator in _tested_estimators(): + set_random_state(estimator, random_state=0) + + X, y = get_input(estimator) + tags = _safe_tags(estimator) + if tags.get("requires_fit", True): + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", module="sklearn") + if y is not None: + estimator.fit(X, y) + else: + estimator.fit(X) + + name = _get_check_estimator_ids(estimator) + time_pickle, time_skops = run_check(estimator, number=NUM_REPS) + + results["name"].append(name) + results["pickle (s)"].append(time_pickle) + results["skops (s)"].append(time_skops) + + format_result(results, topk=TOPK) + + +def run_check(estimator, number): + def run_pickle(): + pickle.loads(pickle.dumps(estimator)) + + def run_skops(): + sio.loads(sio.dumps(estimator), trusted=True) + + time_pickle = timeit.timeit(run_pickle, number=number) / number + time_skops = timeit.timeit(run_skops, number=number) / number + + return time_pickle, time_skops + + +def format_result(results, topk): + df = pd.DataFrame(results) + df = df.assign( + abs_diff=df["skops (s)"] - df["pickle (s)"], + rel_diff=df["skops (s)"] / df["pickle (s)"], + too_slow=lambda d: d["abs_diff"] > ATOL, + ) + + df = df.sort_values(["abs_diff"], ascending=False).reset_index(drop=True) + print(f"{topk} largest differences:") + print(df.head(10)) + + df_slow = df.query("too_slow") + if df_slow.empty: + print("No estimator was found to be unacceptably slow") 
+ return + + print( + f"Found {len(df_slow)} estimator(s) that are at least {ATOL:.1f} sec slower " + "with skops:" + ) + print(", ".join(df_slow["name"].tolist())) + raise RuntimeError("Skops persistence too slow") + + +if __name__ == "__main__": + check_persist_performance() From b56fd3074697ed668e6b6fc20e61a6d36d60ba9f Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Thu, 2 Feb 2023 15:48:56 +0100 Subject: [PATCH 2/3] [skip ci] Change repo in GH action --- .github/workflows/persistence-performance.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/persistence-performance.yml b/.github/workflows/persistence-performance.yml index 5c7e97d1..6cc6aa7c 100644 --- a/.github/workflows/persistence-performance.yml +++ b/.github/workflows/persistence-performance.yml @@ -9,8 +9,7 @@ jobs: check-persistence-performance: runs-on: ubuntu-latest - # if: "github.repository == 'skops-dev/skops'" - if: "github.repository == 'BenjaminBossan/skops'" + if: "github.repository == 'skops-dev/skops'" timeout-minutes: 10 From 755a55ac17ee47f7c1fea699cc81ae1f11f55de0 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Tue, 28 Feb 2023 17:17:10 +0100 Subject: [PATCH 3/3] Add docstrings and annotations to script This should hopefully make it easier to understand what the functions are doing. Also had to update pre-commit dependencies because I was running into this issue: https://github.com/pre-commit/pre-commit/issues/2782 There seems to have been something wrong with its dependencies. Pretty sure it's unrelated to this PR. 
--- .pre-commit-config.yaml | 10 ++++---- scripts/check_persistence_performance.py | 31 +++++++++++++++++++++--- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7e72ddf0..d95e8edf 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.1.0 + rev: v4.4.0 hooks: - id: check-yaml exclude: .github/conda/meta.yaml @@ -10,20 +10,20 @@ repos: - id: check-case-conflict - id: check-merge-conflict - repo: https://github.com/psf/black - rev: 22.6.0 + rev: 23.1.0 hooks: - id: black - repo: https://github.com/pycqa/flake8 - rev: 4.0.1 + rev: 6.0.0 hooks: - id: flake8 types: [file, python] - repo: https://github.com/PyCQA/isort - rev: 5.10.1 + rev: 5.12.0 hooks: - id: isort - repo: https://github.com/pre-commit/mirrors-mypy - rev: v0.971 + rev: v1.0.1 hooks: - id: mypy args: [--config-file=pyproject.toml] diff --git a/scripts/check_persistence_performance.py b/scripts/check_persistence_performance.py index 13300205..5f6d2ca1 100644 --- a/scripts/check_persistence_performance.py +++ b/scripts/check_persistence_performance.py @@ -7,9 +7,12 @@ """ +from __future__ import annotations + import pickle import timeit import warnings +from typing import Any import pandas as pd from sklearn.utils._tags import _safe_tags @@ -27,8 +30,15 @@ TOPK = 10 # number of slowest estimators reported -def check_persist_performance(): - results = {"name": [], "pickle (s)": [], "skops (s)": []} +def check_persist_performance() -> None: + """Run all performance checks on all estimators and print results. + + For each estimator, record how long it takes to dump+load with pickle and + with skops. If any estimator takes much longer (in absolute time) with skops + than pickle, raise a RuntimeError. Print the worst results to stdout. 
+ + """ + results: dict[str, list[Any]] = {"name": [], "pickle (s)": [], "skops (s)": []} for estimator in _tested_estimators(): set_random_state(estimator, random_state=0) @@ -52,7 +62,14 @@ format_result(results, topk=TOPK) -def run_check(estimator, number): +def run_check(estimator, number: int) -> tuple[float, float]: + """Run performance check with the given estimator for pickle and skops. + + The test is run multiple times to get more robust results, ``number`` + indicates how often it is run. + + """ + def run_pickle(): pickle.loads(pickle.dumps(estimator)) @@ -65,7 +82,13 @@ def run_skops(): return time_pickle, time_skops -def format_result(results, topk): +def format_result(results: dict[str, list[Any]], topk: int) -> None: + """Report results from performance checks. + + Print the ``topk`` slowest results. If any estimator takes much longer (in + absolute time) with skops than pickle, raise a RuntimeError. + + """ df = pd.DataFrame(results) df = df.assign( abs_diff=df["skops (s)"] - df["pickle (s)"],