From f5e83a9bc6c9c31c0240e3809545879ac58166a2 Mon Sep 17 00:00:00 2001 From: Juan Camacho Mohedano Date: Tue, 1 Nov 2022 22:04:19 +0100 Subject: [PATCH 01/22] feat: generate README.md in hub_utils.init --- skops/hub_utils/_hf_hub.py | 86 +++++++++++++++++++++++++++++++++++++- 1 file changed, 85 insertions(+), 1 deletion(-) diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py index 6d361972..f550c999 100644 --- a/skops/hub_utils/_hf_hub.py +++ b/skops/hub_utils/_hf_hub.py @@ -13,7 +13,7 @@ from typing import Any, List, MutableMapping, Optional, Union import numpy as np -from huggingface_hub import HfApi, InferenceApi, snapshot_download +from huggingface_hub import CardData, HfApi, InferenceApi, ModelCard, snapshot_download from ..utils.fixes import Literal @@ -213,6 +213,83 @@ def recursively_default_dict() -> MutableMapping: dump_json(Path(dst) / "config.json", config) +def _create_readme( + *, + model_path: Union[str, Path], + requirements: List[str], + dst: Union[str, Path], + task: Literal[ + "tabular-classification", + "tabular-regression", + "text-classification", + "text-regression", + ], + data, +) -> None: + """Write the metadata into a ``README.md`` file + + Parameters + ---------- + model_path : str, or Path + The relative path (from the repo root) to the model file. + + requirements : list of str + A list of required packages. The versions are then extracted from the + current environment. + + dst : str, or Path + The path to an existing folder where the config file should be created. + + task: "tabular-classification", "tabular-regression", + "text-classification", / + or "text-regression" + The task of the model, which determines the input and output type of + the model. It can be one of: ``tabular-classification``, + ``tabular-regression``, ``text-classification``, ``text-regression``. + + data: array-like + The input to the model. This is used for two purposes: + + 1. 
Save an example input to the model, which is used by + HuggingFace's backend and shown in the widget of the model's + page. + 2. Store the columns and their order of the input, which is used by + HuggingFace's backend to pass the data in the right form to the + model. + + The first 3 input values are used as example inputs. + + Returns + ------- + None + """ + card_data = CardData() + card_data.library_name = "sklearn" + card_data.tags = ["sklearn", "skops"] + card_data.task = task + if task: + card_data.tags += [task] + card_data.model_file = str(model_path) + + if "tabular" in task: + example_input = _get_example_input(data) + elif "text" in task: + if isinstance(data, list) and all(isinstance(x, str) for x in data): + example_input = {"data": data[:3]} + else: + raise ValueError("The data needs to be a list of strings.") + + # Documentation on what the widget expects: + # https://huggingface.co/docs/hub/models-widgets-examples + if example_input: + if "tabular" in task: + card_data.widget = {"structuredData": example_input} + # TODO: add text data example here. 
+ + card = ModelCard.from_template(card_data=card_data) + card.save(Path(dst) / "README.md") + + def _check_model_file(path: str | Path) -> Path: """Perform sanity checks on the model file @@ -320,6 +397,13 @@ def init( task=task, data=data, ) + _create_readme( + model_path=model_name, + requirements=requirements, + dst=dst, + task=task, + data=data, + ) except Exception: shutil.rmtree(dst) raise From f0e9683c6a27385e8eac14cd470243a19a9b276c Mon Sep 17 00:00:00 2001 From: Juan Camacho Mohedano Date: Wed, 2 Nov 2022 18:02:32 +0100 Subject: [PATCH 02/22] ref: replace _create_readme function with fewer lines --- skops/hub_utils/_hf_hub.py | 94 ++++---------------------------------- 1 file changed, 9 insertions(+), 85 deletions(-) diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py index f550c999..e2abf9cb 100644 --- a/skops/hub_utils/_hf_hub.py +++ b/skops/hub_utils/_hf_hub.py @@ -10,10 +10,13 @@ import shutil import warnings from pathlib import Path +from pickle import load from typing import Any, List, MutableMapping, Optional, Union import numpy as np -from huggingface_hub import CardData, HfApi, InferenceApi, ModelCard, snapshot_download +from huggingface_hub import HfApi, InferenceApi, snapshot_download + +from skops import card from ..utils.fixes import Literal @@ -213,83 +216,6 @@ def recursively_default_dict() -> MutableMapping: dump_json(Path(dst) / "config.json", config) -def _create_readme( - *, - model_path: Union[str, Path], - requirements: List[str], - dst: Union[str, Path], - task: Literal[ - "tabular-classification", - "tabular-regression", - "text-classification", - "text-regression", - ], - data, -) -> None: - """Write the metadata into a ``README.md`` file - - Parameters - ---------- - model_path : str, or Path - The relative path (from the repo root) to the model file. - - requirements : list of str - A list of required packages. The versions are then extracted from the - current environment. 
- - dst : str, or Path - The path to an existing folder where the config file should be created. - - task: "tabular-classification", "tabular-regression", - "text-classification", / - or "text-regression" - The task of the model, which determines the input and output type of - the model. It can be one of: ``tabular-classification``, - ``tabular-regression``, ``text-classification``, ``text-regression``. - - data: array-like - The input to the model. This is used for two purposes: - - 1. Save an example input to the model, which is used by - HuggingFace's backend and shown in the widget of the model's - page. - 2. Store the columns and their order of the input, which is used by - HuggingFace's backend to pass the data in the right form to the - model. - - The first 3 input values are used as example inputs. - - Returns - ------- - None - """ - card_data = CardData() - card_data.library_name = "sklearn" - card_data.tags = ["sklearn", "skops"] - card_data.task = task - if task: - card_data.tags += [task] - card_data.model_file = str(model_path) - - if "tabular" in task: - example_input = _get_example_input(data) - elif "text" in task: - if isinstance(data, list) and all(isinstance(x, str) for x in data): - example_input = {"data": data[:3]} - else: - raise ValueError("The data needs to be a list of strings.") - - # Documentation on what the widget expects: - # https://huggingface.co/docs/hub/models-widgets-examples - if example_input: - if "tabular" in task: - card_data.widget = {"structuredData": example_input} - # TODO: add text data example here. 
- - card = ModelCard.from_template(card_data=card_data) - card.save(Path(dst) / "README.md") - - def _check_model_file(path: str | Path) -> Path: """Perform sanity checks on the model file @@ -397,13 +323,11 @@ def init( task=task, data=data, ) - _create_readme( - model_path=model_name, - requirements=requirements, - dst=dst, - task=task, - data=data, - ) + + with open(model, "rb") as f: + model = load(f) + model_card = card.Card(model, metadata=card.metadata_from_config(dst)) + model_card.save(dst / "README.md") except Exception: shutil.rmtree(dst) raise From 1aeb14cae2d9841166e6cd2e934f0d8af06f9c78 Mon Sep 17 00:00:00 2001 From: Juan Camacho Mohedano Date: Thu, 3 Nov 2022 16:09:01 +0100 Subject: [PATCH 03/22] test create model card in hub_utils.init --- skops/hub_utils/tests/test_hf_hub.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py index 590b0169..ab31cb3b 100644 --- a/skops/hub_utils/tests/test_hf_hub.py +++ b/skops/hub_utils/tests/test_hf_hub.py @@ -271,6 +271,23 @@ def test_init(classifier_pickle, config_json): ) +def test_init_modelcard_creation(classifier_pickle, config_json): + # create a temp directory and delete it, we just need a unique name. 
+ dir_path = tempfile.mkdtemp() + shutil.rmtree(dir_path) + + version = metadata.version("scikit-learn") + init( + model=classifier_pickle, + requirements=[f'scikit-learn="{version}"'], + dst=dir_path, + task="tabular-classification", + data=iris.data, + ) + _validate_folder(path=dir_path) + assert os.path.isfile(Path(dir_path) / "README.md") + + def test_init_no_warning_or_error(classifier_pickle, config_json): # for the happy path, there should be no warning dir_path = tempfile.mkdtemp() From 95c0e1bf2185cf7e1353269422caa2a8a7d4bdc3 Mon Sep 17 00:00:00 2001 From: Juan Camacho Mohedano Date: Thu, 3 Nov 2022 16:12:09 +0100 Subject: [PATCH 04/22] test override model card after created by hub_utils.init --- skops/hub_utils/tests/test_hf_hub.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py index ab31cb3b..0daf2cab 100644 --- a/skops/hub_utils/tests/test_hf_hub.py +++ b/skops/hub_utils/tests/test_hf_hub.py @@ -288,6 +288,32 @@ def test_init_modelcard_creation(classifier_pickle, config_json): assert os.path.isfile(Path(dir_path) / "README.md") +def test_override_init_modelcard(classifier_pickle, config_json): + # create a temp directory and delete it, we just need a unique name. 
+ dir_path = tempfile.mkdtemp() + shutil.rmtree(dir_path) + + version = metadata.version("scikit-learn") + init( + model=classifier_pickle, + requirements=[f'scikit-learn="{version}"'], + dst=dir_path, + task="tabular-classification", + data=iris.data, + ) + _validate_folder(path=dir_path) + t0 = os.path.getmtime(Path(dir_path) / "README.md") + + # override existent modelcard created by init + model = get_classifier() + model_card = card.Card(model, metadata=card.metadata_from_config(Path(dir_path))) + model_card.save(Path(dir_path) / "README.md") + t1 = os.path.getmtime(Path(dir_path) / "README.md") + + # compare the times at which the files were last modified + assert t0 != t1 + + def test_init_no_warning_or_error(classifier_pickle, config_json): # for the happy path, there should be no warning dir_path = tempfile.mkdtemp() From 4b6cb7392faeaf004a126c599f06b8c694ca15c9 Mon Sep 17 00:00:00 2001 From: Juan Camacho Mohedano Date: Mon, 14 Nov 2022 10:40:30 +0100 Subject: [PATCH 05/22] ref: deduplicate test creation of README in init --- skops/hub_utils/tests/test_hf_hub.py | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py index ed42862d..4def319e 100644 --- a/skops/hub_utils/tests/test_hf_hub.py +++ b/skops/hub_utils/tests/test_hf_hub.py @@ -260,6 +260,8 @@ def test_init(classifier_pickle, config_json): ) _validate_folder(path=dir_path) + assert os.path.isfile(Path(dir_path) / "README.md") + # it should fail a second time since the folder is no longer empty. with pytest.raises(OSError, match="None-empty dst path already exists!"): init( @@ -271,23 +273,6 @@ def test_init(classifier_pickle, config_json): ) -def test_init_modelcard_creation(classifier_pickle, config_json): - # create a temp directory and delete it, we just need a unique name. 
- dir_path = tempfile.mkdtemp() - shutil.rmtree(dir_path) - - version = metadata.version("scikit-learn") - init( - model=classifier_pickle, - requirements=[f'scikit-learn="{version}"'], - dst=dir_path, - task="tabular-classification", - data=iris.data, - ) - _validate_folder(path=dir_path) - assert os.path.isfile(Path(dir_path) / "README.md") - - def test_override_init_modelcard(classifier_pickle, config_json): # create a temp directory and delete it, we just need a unique name. dir_path = tempfile.mkdtemp() @@ -438,7 +423,6 @@ def repo_path_for_inference(): @pytest.mark.network -@pytest.mark.inference @pytest.mark.skipif( IS_SKLEARN_DEV_BUILD, reason="Inference tests cannot run with sklearn dev build" ) @@ -458,8 +442,7 @@ def test_inference( repo_path_for_inference, destination_path, ): - # test inference backend for classifier and regressor models. This test can - # take a lot of time and be flaky. + # test inference backend for classifier and regressor models. client = HfApi() repo_path = repo_path_for_inference From 870797f70d1f7e2317ac10058687450bf8bede6d Mon Sep 17 00:00:00 2001 From: Juan Camacho Mohedano Date: Mon, 14 Nov 2022 20:07:39 +0100 Subject: [PATCH 06/22] fix: check that content of new model card is modified --- skops/hub_utils/tests/test_hf_hub.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py index 4def319e..a9996e97 100644 --- a/skops/hub_utils/tests/test_hf_hub.py +++ b/skops/hub_utils/tests/test_hf_hub.py @@ -14,6 +14,7 @@ import sklearn from flaky import flaky from huggingface_hub import HfApi +from huggingface_hub.repocard import RepoCard from huggingface_hub.utils import RepositoryNotFoundError from sklearn.datasets import load_diabetes, load_iris from sklearn.linear_model import LinearRegression, LogisticRegression @@ -287,16 +288,21 @@ def test_override_init_modelcard(classifier_pickle, config_json): data=iris.data, ) 
_validate_folder(path=dir_path) - t0 = os.path.getmtime(Path(dir_path) / "README.md") - # override existent modelcard created by init + # inital card does not have a license set + with pytest.raises( + AttributeError, match="'CardData' object has no attribute 'license'" + ): + model_card = RepoCard.load(Path(dir_path) / "README.md") + model_card.data.license + + # override existent modelcard created by init with license attribute model = get_classifier() model_card = card.Card(model, metadata=card.metadata_from_config(Path(dir_path))) + model_card.metadata.license = "mit" model_card.save(Path(dir_path) / "README.md") - t1 = os.path.getmtime(Path(dir_path) / "README.md") - - # compare the times at which the files were last modified - assert t0 != t1 + new_card = RepoCard.load(Path(dir_path) / "README.md") + assert new_card.data.license == "mit" def test_init_no_warning_or_error(classifier_pickle, config_json): From f182ee1d10bb5ea1cce90e7adc860fa6c78d59b6 Mon Sep 17 00:00:00 2001 From: Juan Camacho Mohedano Date: Fri, 18 Nov 2022 10:25:20 +0100 Subject: [PATCH 07/22] revert lines removed by mistake --- skops/hub_utils/tests/test_hf_hub.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py index a9996e97..636bf0dc 100644 --- a/skops/hub_utils/tests/test_hf_hub.py +++ b/skops/hub_utils/tests/test_hf_hub.py @@ -429,6 +429,7 @@ def repo_path_for_inference(): @pytest.mark.network +@pytest.mark.inference @pytest.mark.skipif( IS_SKLEARN_DEV_BUILD, reason="Inference tests cannot run with sklearn dev build" ) @@ -448,7 +449,8 @@ def test_inference( repo_path_for_inference, destination_path, ): - # test inference backend for classifier and regressor models. + # test inference backend for classifier and regressor models. This test can + # take a lot of time and be flaky. 
client = HfApi() repo_path = repo_path_for_inference From 0b6d3e26bb9bd05bb37c12583c232d5878007ee5 Mon Sep 17 00:00:00 2001 From: Juan Camacho Mohedano Date: Sat, 21 Jan 2023 16:59:24 +0100 Subject: [PATCH 08/22] fix: check model format of model file --- skops/hub_utils/_hf_hub.py | 14 ++++++++++---- skops/hub_utils/tests/test_hf_hub.py | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py index 39a5647d..8db283ed 100644 --- a/skops/hub_utils/_hf_hub.py +++ b/skops/hub_utils/_hf_hub.py @@ -11,6 +11,9 @@ import shutil from pathlib import Path from typing import Any, List, Literal, MutableMapping, Optional, Sequence, Union +from pickle import load as pikle_load +from skops import card, io + import numpy as np from huggingface_hub import HfApi, InferenceApi, snapshot_download @@ -409,10 +412,13 @@ def init( model_format=model_format, ) - with open(model, "rb") as f: - model = load(f) - model_card = card.Card(model, metadata=card.metadata_from_config(dst)) - model_card.save(dst / "README.md") + if model_format == 'pickle': + with open(model, "rb") as f: + model = pikle_load(f) + elif model_format == 'skops': + model = io.load(model) + model_card = card.Card(model, metadata=card.metadata_from_config(dst)) + model_card.save(dst / "README.md") except Exception: shutil.rmtree(dst) raise diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py index 7445e5a8..6b0aefc0 100644 --- a/skops/hub_utils/tests/test_hf_hub.py +++ b/skops/hub_utils/tests/test_hf_hub.py @@ -313,7 +313,7 @@ def test_override_init_modelcard(classifier, config_json): version = metadata.version("scikit-learn") init( - model=classifier_pickle, + model=classifier, requirements=[f'scikit-learn="{version}"'], dst=dir_path, task="tabular-classification", From 5e1494a571129747fd7360726396bdc33149a7d4 Mon Sep 17 00:00:00 2001 From: Juan Camacho Mohedano Date: Mon, 23 Jan 2023 22:50:46 +0100 Subject: [PATCH 
09/22] fix: run pre-commit on all files --- .github/workflows/PULL_REQUEST_TEMPLATE.md | 10 +++++----- skops/hub_utils/_hf_hub.py | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/PULL_REQUEST_TEMPLATE.md b/.github/workflows/PULL_REQUEST_TEMPLATE.md index d73ca321..7fb46c06 100644 --- a/.github/workflows/PULL_REQUEST_TEMPLATE.md +++ b/.github/workflows/PULL_REQUEST_TEMPLATE.md @@ -1,16 +1,16 @@ #### Reference Issues/PRs @@ -29,4 +29,4 @@ review, either the pull request needs some benchmarking, tinkering, convincing, etc. or more likely the reviewers are simply busy. Thanks for contributing! ---> \ No newline at end of file +--> diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py index 8db283ed..eed4f380 100644 --- a/skops/hub_utils/_hf_hub.py +++ b/skops/hub_utils/_hf_hub.py @@ -10,15 +10,15 @@ import os import shutil from pathlib import Path -from typing import Any, List, Literal, MutableMapping, Optional, Sequence, Union from pickle import load as pikle_load -from skops import card, io - +from typing import Any, List, Literal, MutableMapping, Optional, Sequence, Union import numpy as np from huggingface_hub import HfApi, InferenceApi, snapshot_download from sklearn.utils import check_array +from skops import card, io + SUPPORTED_TASKS = [ "tabular-classification", "tabular-regression", @@ -412,10 +412,10 @@ def init( model_format=model_format, ) - if model_format == 'pickle': + if model_format == "pickle": with open(model, "rb") as f: model = pikle_load(f) - elif model_format == 'skops': + elif model_format == "skops": model = io.load(model) model_card = card.Card(model, metadata=card.metadata_from_config(dst)) model_card.save(dst / "README.md") From 0c4a66fdc11025ba2a06fedfba4cb63a86144989 Mon Sep 17 00:00:00 2001 From: Juan Camacho Mohedano Date: Wed, 1 Feb 2023 00:45:14 +0100 Subject: [PATCH 10/22] fix: check for file suffix to determine format --- skops/hub_utils/_hf_hub.py | 5 +++-- 1 file 
changed, 3 insertions(+), 2 deletions(-) diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py index c0d5c972..5cb03345 100644 --- a/skops/hub_utils/_hf_hub.py +++ b/skops/hub_utils/_hf_hub.py @@ -431,10 +431,11 @@ def init( use_intelex=use_intelex, ) - if model_format == "pickle": + extension = Path(model_name).suffix + if extension in [".pkl", ".pickle", ".joblib"]: with open(model, "rb") as f: model = pikle_load(f) - elif model_format == "skops": + elif extension == ".skops": model = io.load(model) model_card = card.Card(model, metadata=card.metadata_from_config(dst)) model_card.save(dst / "README.md") From eec7a8d597d08cfefa5097c0d49327af890ea5d5 Mon Sep 17 00:00:00 2001 From: Juan Camacho Mohedano Date: Wed, 1 Mar 2023 23:23:13 +0100 Subject: [PATCH 11/22] fix: check for model_format and suffix if auto --- skops/hub_utils/_hf_hub.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py index 5cb03345..41633151 100644 --- a/skops/hub_utils/_hf_hub.py +++ b/skops/hub_utils/_hf_hub.py @@ -431,12 +431,18 @@ def init( use_intelex=use_intelex, ) - extension = Path(model_name).suffix - if extension in [".pkl", ".pickle", ".joblib"]: + if model_format in ["pkl", "pickle"]: with open(model, "rb") as f: model = pikle_load(f) - elif extension == ".skops": + elif model_format == "skops": model = io.load(model) + else: # model_format is auto + extension = Path(model_name).suffix + if extension in [".pkl", ".pickle", ".joblib"]: + with open(model, "rb") as f: + model = pikle_load(f) + elif extension == ".skops": + model = io.load(model) model_card = card.Card(model, metadata=card.metadata_from_config(dst)) model_card.save(dst / "README.md") except Exception: From 1503850e84b9caca6348c14e1c44fbd0c764fa12 Mon Sep 17 00:00:00 2001 From: Juan Camacho Mohedano Date: Wed, 1 Mar 2023 23:26:14 +0100 Subject: [PATCH 12/22] fix: use model_format argument in test and create new parameter 
for auto --- skops/hub_utils/tests/test_hf_hub.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py index 519ccb44..e5907601 100644 --- a/skops/hub_utils/tests/test_hf_hub.py +++ b/skops/hub_utils/tests/test_hf_hub.py @@ -91,6 +91,8 @@ def classifier(repo_path, config_json): pickle.dump(clf, f) elif file_format == "skops": dump(clf, path) + elif file_format == "auto": + dump(clf, path) yield path finally: path.unlink(missing_ok=True) @@ -109,10 +111,16 @@ def classifier(repo_path, config_json): "model": {"file": "model.skops"}, } }, + "auto": { + "sklearn": { + "environment": ['scikit-learn="1.1.1"'], + "model": {"file": "model.skops"}, + } + }, } -@pytest.fixture(scope="session", params=["skops", "pickle"]) +@pytest.fixture(scope="session", params=["skops", "pickle", "auto"]) def config_json(repo_path, request): path = repo_path / "config.json" try: @@ -312,12 +320,14 @@ def test_override_init_modelcard(classifier, config_json): shutil.rmtree(dir_path) version = metadata.version("scikit-learn") + _, model_format = config_json init( model=classifier, requirements=[f'scikit-learn="{version}"'], dst=dir_path, task="tabular-classification", data=iris.data, + model_format=model_format, ) _validate_folder(path=dir_path) From 0249d41e3b03ea97707c3465e1e647402dbf3929 Mon Sep 17 00:00:00 2001 From: Juan Camacho Mohedano Date: Wed, 1 Mar 2023 23:40:32 +0100 Subject: [PATCH 13/22] fix: correct file format --- examples/plot_tabular_regression.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/plot_tabular_regression.py b/examples/plot_tabular_regression.py index 5e9d8fb0..3b860d44 100644 --- a/examples/plot_tabular_regression.py +++ b/examples/plot_tabular_regression.py @@ -16,7 +16,6 @@ from tempfile import mkdtemp, mkstemp import matplotlib.pyplot as plt -import pandas as pd import sklearn from sklearn.datasets import load_diabetes from 
sklearn.linear_model import LinearRegression @@ -42,7 +41,8 @@ # Train a Model # ============= # To train a model, we need to convert our data first to vectors. We will use -# StandardScalar in our pipeline. We will fit a Linear Regression model with the outputs of the scalar. +# StandardScalar in our pipeline. We will fit a Linear Regression +# model with the outputs of the scalar. model = Pipeline( [ ("scaler", StandardScaler()), @@ -63,7 +63,7 @@ # Initialize a repository to save our files in # ============================================ # We will now initialize a repository and save our model -_, pkl_name = mkstemp(prefix="skops-", suffix=".pkl") +_, pkl_name = mkstemp(prefix="skops-", suffix=".skops") with open(pkl_name, mode="bw") as f: sio.dump(model, file=f) From 1d23024d6f65703d159415f457f299cec2ba399b Mon Sep 17 00:00:00 2001 From: Juan Camacho Mohedano Date: Wed, 8 Mar 2023 00:09:21 +0100 Subject: [PATCH 14/22] fix: argument type --- docs/examples.rst | 8 ++++---- examples/plot_custom_model_card.py | 4 +++- examples/plot_model_card.py | 6 ++++-- skops/hub_utils/_hf_hub.py | 1 + 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/docs/examples.rst b/docs/examples.rst index 75dc5097..570f1f2a 100644 --- a/docs/examples.rst +++ b/docs/examples.rst @@ -4,16 +4,16 @@ Examples of interactions with the Hugging Face Hub ================================================== - Creating the Model Card: - :ref:`sphx_glr_auto_examples_plot_model_card.py` is an example of using + :ref:`sphx_glr_auto_examples_plot_model_card.py` is an example of using skops to create a model card that can be used on the Hugging Face Hub. - Putting the Model Card on the Hub: - :ref:`sphx_glr_auto_examples_plot_hf_hub.py` is an example of using skops + :ref:`sphx_glr_auto_examples_plot_hf_hub.py` is an example of using skops to put a model card on the Hugging Face Hub. 
- Tabular Regression: - :ref:`sphx_glr_auto_examples_plot_tabular_regression.py` is an example of using skops to serialize a tabular + :ref:`sphx_glr_auto_examples_plot_tabular_regression.py` is an example of using skops to serialize a tabular regression model and create a model card and a Hugging Face Hub repository. - Text Classification: - :ref:`sphx_glr_auto_examples_plot_text_classification.py` is an example of using skops to serialize a text + :ref:`sphx_glr_auto_examples_plot_text_classification.py` is an example of using skops to serialize a text classification model and create a model card and a Hugging Face Hub repository. - Using Intel(R) Extension for scikit-learn: :ref:`sphx_glr_auto_examples_plot_intelex.py` is an example of using diff --git a/examples/plot_custom_model_card.py b/examples/plot_custom_model_card.py index 0033950b..1385836f 100644 --- a/examples/plot_custom_model_card.py +++ b/examples/plot_custom_model_card.py @@ -163,7 +163,9 @@ display.figure_.savefig(plot_file_name) model_card.add_plot( **{ - "Regression on California Housing dataset/Results/Partial Dependence Plots": plot_file_name + "Regression on California Housing dataset/Results/Partial Dependence Plots": ( + plot_file_name + ) }, ) diff --git a/examples/plot_model_card.py b/examples/plot_model_card.py index 5e68b7e0..278a6f6d 100644 --- a/examples/plot_model_card.py +++ b/examples/plot_model_card.py @@ -158,7 +158,7 @@ model_card.add_permutation_importances( importances, X_test.columns, - plot_file=Path(local_repo) / "importance.png", + plot_file=local_repo + "/importance.png", plot_name="Permutation Importance", ) @@ -174,7 +174,9 @@ model_card.add_table( folded=True, **{ - "Model description/Evaluation Results/Hyperparameter search results": cv_results, + "Model description/Evaluation Results/Hyperparameter search results": ( + cv_results + ), "Model description/Evaluation Results/Classification report": clf_report, }, ) diff --git a/skops/hub_utils/_hf_hub.py 
b/skops/hub_utils/_hf_hub.py index 41633151..e26fd645 100644 --- a/skops/hub_utils/_hf_hub.py +++ b/skops/hub_utils/_hf_hub.py @@ -267,6 +267,7 @@ def _create_config( does not support it. For more info, see https://intel.github.io/scikit-learn-intelex/. """ + # so that we don't have to explicitly add keys and they're added as a # dictionary if they are not found # see: https://stackoverflow.com/a/13151294/2536294 From b420020f02f6b8f85acb5abc514440a58bd915b9 Mon Sep 17 00:00:00 2001 From: Juan Camacho Mohedano Date: Mon, 1 May 2023 21:32:51 +0200 Subject: [PATCH 15/22] fix reading file in example --- examples/plot_intelex.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/plot_intelex.py b/examples/plot_intelex.py index 3b689037..ca262670 100644 --- a/examples/plot_intelex.py +++ b/examples/plot_intelex.py @@ -151,7 +151,7 @@ with NamedTemporaryFile(mode="bw", prefix="stock-", suffix=".pkl") as fp: pickle.dump(clf, file=fp) - + fp.seek(0) stock_repo = mkdtemp(prefix="stock-") hub_utils.init( model=fp.name, @@ -164,7 +164,7 @@ with NamedTemporaryFile(mode="bw", prefix="opt-", suffix=".pkl") as fp: pickle.dump(clf_opt, file=fp) - + fp.seek(0) opt_repo = mkdtemp(prefix="opt-") hub_utils.init( model=fp.name, From 3b78b3423b117bf984a6403e7e57ba7a66a49b47 Mon Sep 17 00:00:00 2001 From: Juan Camacho Mohedano Date: Mon, 1 May 2023 21:58:59 +0200 Subject: [PATCH 16/22] fix: add trusted argument to io.load when checking file extension --- skops/hub_utils/_hf_hub.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py index e26fd645..dcfef424 100644 --- a/skops/hub_utils/_hf_hub.py +++ b/skops/hub_utils/_hf_hub.py @@ -443,7 +443,7 @@ def init( with open(model, "rb") as f: model = pikle_load(f) elif extension == ".skops": - model = io.load(model) + model = io.load(model, trusted=True) model_card = card.Card(model, metadata=card.metadata_from_config(dst)) model_card.save(dst / 
"README.md") except Exception: From 4e424080af53e03d3b51a964c08b8e0ba4e5f583 Mon Sep 17 00:00:00 2001 From: Juan Camacho Mohedano Date: Fri, 26 May 2023 09:55:25 +0200 Subject: [PATCH 17/22] fix: add support for model parameter in init based on its type; update respective test --- skops/hub_utils/_hf_hub.py | 65 ++++++++++++++++++++-------- skops/hub_utils/tests/test_hf_hub.py | 31 +++++++++++-- 2 files changed, 73 insertions(+), 23 deletions(-) diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py index dcfef424..93d0e42e 100644 --- a/skops/hub_utils/_hf_hub.py +++ b/skops/hub_utils/_hf_hub.py @@ -10,11 +10,13 @@ import os import shutil from pathlib import Path -from pickle import load as pikle_load +from pickle import dump as pickle_dump +from pickle import load as pickle_load from typing import Any, List, Literal, MutableMapping, Optional, Sequence, Union import numpy as np from huggingface_hub import HfApi, InferenceApi, snapshot_download +from sklearn.base import BaseEstimator from sklearn.utils import check_array from skops import card, io @@ -408,8 +410,6 @@ def init( f"Task {task} not supported. 
Supported tasks are: {SUPPORTED_TASKS}" ) - model = _check_model_file(model) - dst.mkdir(parents=True, exist_ok=True) # add intelex requirement, if it's used and not already in requirements @@ -418,10 +418,47 @@ def init( ): requirements.append("scikit-learn-intelex") + # model parameter can be either a path or a model object try: - shutil.copy2(src=model, dst=dst) + if isinstance(model, (str, Path)): + model = _check_model_file(model) + shutil.copy2(src=model, dst=dst) + + model_name = model.name + # open model file according to its model format + if model_format in ["pkl", "pickle"]: + with open(model, "rb") as f: + model = pickle_load(f) + elif model_format == "skops": + model = io.load(model) + else: # model_format is auto + extension = Path(model_name).suffix + if extension in [".pkl", ".pickle", ".joblib"]: + with open(model, "rb") as f: # not tested + model = pickle_load(f) # not tested + elif extension == ".skops": + model = io.load(model, trusted=True) + elif isinstance(model, BaseEstimator): + # if it is a model object and its format is set to auto, choose skops by default + if model_format == "auto": + model_format = "skops" + elif model_format in ["pkl", "pickle", "joblib"]: + model_format = "pickle" + model_name = Path(dst / f"model.{model_format}") + + # create model file if it doesn't exist to make a valid repository + if not os.path.isfile(model_name): + if model_format == "pickle": + with open(model_name, "wb") as f: + pickle_dump(model, f) + elif model_format == "skops": + io.dump(model, model_name) + else: + raise ValueError( + "Cannot determine the input model argument. " + "Please indicate a model with the expected type." 
+ ) - model_name = model.name _create_config( model_path=model_name, requirements=requirements, @@ -432,20 +469,10 @@ def init( use_intelex=use_intelex, ) - if model_format in ["pkl", "pickle"]: - with open(model, "rb") as f: - model = pikle_load(f) - elif model_format == "skops": - model = io.load(model) - else: # model_format is auto - extension = Path(model_name).suffix - if extension in [".pkl", ".pickle", ".joblib"]: - with open(model, "rb") as f: - model = pikle_load(f) - elif extension == ".skops": - model = io.load(model, trusted=True) - model_card = card.Card(model, metadata=card.metadata_from_config(dst)) - model_card.save(dst / "README.md") + # create README if it doesn't exist + if not os.path.isfile(dst / "README.md"): + model_card = card.Card(model, metadata=card.metadata_from_config(dst)) + model_card.save(dst / "README.md") except Exception: shutil.rmtree(dst) raise diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py index e5907601..d8e27c79 100644 --- a/skops/hub_utils/tests/test_hf_hub.py +++ b/skops/hub_utils/tests/test_hf_hub.py @@ -86,7 +86,7 @@ def classifier(repo_path, config_json): path = repo_path / model_file try: - if file_format == "pickle": + if file_format == "pickle" or file_format == "joblib": with open(path, "wb") as f: pickle.dump(clf, f) elif file_format == "skops": @@ -117,10 +117,16 @@ def classifier(repo_path, config_json): "model": {"file": "model.skops"}, } }, + "joblib": { + "sklearn": { + "environment": ['scikit-learn="1.1.1"'], + "model": {"file": "model.joblib"}, + } + }, } -@pytest.fixture(scope="session", params=["skops", "pickle", "auto"]) +@pytest.fixture(scope="session", params=["skops", "pickle", "auto", "joblib"]) def config_json(repo_path, request): path = repo_path / "config.json" try: @@ -314,15 +320,32 @@ def test_init(classifier, config_json): ) -def test_override_init_modelcard(classifier, config_json): +@pytest.fixture( + params=[pytest.param("classifier", 
marks=pytest.mark.usefixtures), get_classifier()] +) +def classifiers(request): + # Returns a model object or a path to a model with all + # model formats combinations from CONFIG dict + try: + yield request.getfixturevalue(request.param) + except Exception: # get_classifier() is not a fixture, exception raised + yield request.param + + +def test_override_init_modelcard(classifiers, config_json): # create a temp directory and delete it, we just need a unique name. dir_path = tempfile.mkdtemp() shutil.rmtree(dir_path) version = metadata.version("scikit-learn") _, model_format = config_json + # joblib type falls under auto format, explicitly set to auto + # because we can't repeat key "auto" in CONFIG dict + if model_format == "joblib": + model_format = "auto" + init( - model=classifier, + model=classifiers, requirements=[f'scikit-learn="{version}"'], dst=dir_path, task="tabular-classification", From 61388852c18ba99e82e9b0c0cdc8b2f3a793e82c Mon Sep 17 00:00:00 2001 From: Juan Camacho Mohedano Date: Fri, 26 May 2023 10:13:45 +0200 Subject: [PATCH 18/22] fix: revert files changes --- examples/plot_custom_model_card.py | 4 +--- examples/plot_model_card.py | 2 +- examples/plot_tabular_regression.py | 6 +++--- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/examples/plot_custom_model_card.py b/examples/plot_custom_model_card.py index 1385836f..0033950b 100644 --- a/examples/plot_custom_model_card.py +++ b/examples/plot_custom_model_card.py @@ -163,9 +163,7 @@ display.figure_.savefig(plot_file_name) model_card.add_plot( **{ - "Regression on California Housing dataset/Results/Partial Dependence Plots": ( - plot_file_name - ) + "Regression on California Housing dataset/Results/Partial Dependence Plots": plot_file_name }, ) diff --git a/examples/plot_model_card.py b/examples/plot_model_card.py index d40c6be4..ca6c57a6 100644 --- a/examples/plot_model_card.py +++ b/examples/plot_model_card.py @@ -158,7 +158,7 @@ model_card.add_permutation_importances( 
importances, X_test.columns, - plot_file=local_repo + "/importance.png", + plot_file=Path(local_repo) / "importance.png", plot_name="Permutation Importance", ) diff --git a/examples/plot_tabular_regression.py b/examples/plot_tabular_regression.py index 3b860d44..5e9d8fb0 100644 --- a/examples/plot_tabular_regression.py +++ b/examples/plot_tabular_regression.py @@ -16,6 +16,7 @@ from tempfile import mkdtemp, mkstemp import matplotlib.pyplot as plt +import pandas as pd import sklearn from sklearn.datasets import load_diabetes from sklearn.linear_model import LinearRegression @@ -41,8 +42,7 @@ # Train a Model # ============= # To train a model, we need to convert our data first to vectors. We will use -# StandardScalar in our pipeline. We will fit a Linear Regression -# model with the outputs of the scalar. +# StandardScalar in our pipeline. We will fit a Linear Regression model with the outputs of the scalar. model = Pipeline( [ ("scaler", StandardScaler()), @@ -63,7 +63,7 @@ # Initialize a repository to save our files in # ============================================ # We will now initialize a repository and save our model -_, pkl_name = mkstemp(prefix="skops-", suffix=".skops") +_, pkl_name = mkstemp(prefix="skops-", suffix=".pkl") with open(pkl_name, mode="bw") as f: sio.dump(model, file=f) From dd49d5c05bef1f584be77fb36bd7d35cf09c1ce2 Mon Sep 17 00:00:00 2001 From: Juan Camacho Mohedano Date: Fri, 26 May 2023 21:30:12 +0200 Subject: [PATCH 19/22] change file format in example to skops --- examples/plot_tabular_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/plot_tabular_regression.py b/examples/plot_tabular_regression.py index 5e9d8fb0..b7bdd6b3 100644 --- a/examples/plot_tabular_regression.py +++ b/examples/plot_tabular_regression.py @@ -63,7 +63,7 @@ # Initialize a repository to save our files in # ============================================ # We will now initialize a repository and save our model -_, pkl_name = 
mkstemp(prefix="skops-", suffix=".pkl") +_, pkl_name = mkstemp(prefix="skops-", suffix=".skops") with open(pkl_name, mode="bw") as f: sio.dump(model, file=f) From 21c3a5f6ad4f7b206ce4a47a438f19ce244f19d3 Mon Sep 17 00:00:00 2001 From: Juan Camacho Mohedano Date: Fri, 2 Jun 2023 09:55:24 +0200 Subject: [PATCH 20/22] fix: revert to always create README if model is str/Path and handle if model is an estimator to create README --- skops/hub_utils/_hf_hub.py | 42 +++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py index 93d0e42e..24419654 100644 --- a/skops/hub_utils/_hf_hub.py +++ b/skops/hub_utils/_hf_hub.py @@ -425,6 +425,17 @@ def init( shutil.copy2(src=model, dst=dst) model_name = model.name + + _create_config( + model_path=model_name, + requirements=requirements, + dst=dst, + task=task, + data=data, + model_format=model_format, + use_intelex=use_intelex, + ) + # open model file according to its model format if model_format in ["pkl", "pickle"]: with open(model, "rb") as f: @@ -438,6 +449,8 @@ def init( model = pickle_load(f) # not tested elif extension == ".skops": model = io.load(model, trusted=True) + model_card = card.Card(model, metadata=card.metadata_from_config(dst)) + model_card.save(dst / "README.md") elif isinstance(model, BaseEstimator): # if it is a model object and its format is set to auto, choose skops by default if model_format == "auto": @@ -446,6 +459,16 @@ def init( model_format = "pickle" model_name = Path(dst / f"model.{model_format}") + _create_config( + model_path=model_name, + requirements=requirements, + dst=dst, + task=task, + data=data, + model_format=model_format, + use_intelex=use_intelex, + ) + # create model file if it doesn't exist to make a valid repository if not os.path.isfile(model_name): if model_format == "pickle": @@ -453,26 +476,17 @@ def init( pickle_dump(model, f) elif model_format == "skops": io.dump(model, model_name) 
+ + # create README if it doesn't exist + if not os.path.isfile(dst / "README.md"): + model_card = card.Card(model, metadata=card.metadata_from_config(dst)) + model_card.save(dst / "README.md") else: raise ValueError( "Cannot determine the input model argument. " "Please indicate a model with the expected type." ) - _create_config( - model_path=model_name, - requirements=requirements, - dst=dst, - task=task, - data=data, - model_format=model_format, - use_intelex=use_intelex, - ) - - # create README if it doesn't exist - if not os.path.isfile(dst / "README.md"): - model_card = card.Card(model, metadata=card.metadata_from_config(dst)) - model_card.save(dst / "README.md") except Exception: shutil.rmtree(dst) raise From 98e64b47a95654d888858ff07e761a2a193ddf62 Mon Sep 17 00:00:00 2001 From: Juan Camacho Mohedano Date: Wed, 12 Jul 2023 20:26:49 +0200 Subject: [PATCH 21/22] replace open file block of code with _load_model function --- skops/hub_utils/_hf_hub.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py index 24419654..e0000112 100644 --- a/skops/hub_utils/_hf_hub.py +++ b/skops/hub_utils/_hf_hub.py @@ -11,7 +11,6 @@ import shutil from pathlib import Path from pickle import dump as pickle_dump -from pickle import load as pickle_load from typing import Any, List, Literal, MutableMapping, Optional, Sequence, Union import numpy as np @@ -20,6 +19,7 @@ from sklearn.utils import check_array from skops import card, io +from skops.card._model_card import _load_model SUPPORTED_TASKS = [ "tabular-classification", @@ -436,19 +436,8 @@ def init( use_intelex=use_intelex, ) - # open model file according to its model format - if model_format in ["pkl", "pickle"]: - with open(model, "rb") as f: - model = pickle_load(f) - elif model_format == "skops": - model = io.load(model) - else: # model_format is auto - extension = Path(model_name).suffix - if extension in [".pkl", ".pickle", 
".joblib"]: - with open(model, "rb") as f: # not tested - model = pickle_load(f) # not tested - elif extension == ".skops": - model = io.load(model, trusted=True) + # load model from file + model = _load_model(model) model_card = card.Card(model, metadata=card.metadata_from_config(dst)) model_card.save(dst / "README.md") elif isinstance(model, BaseEstimator): From ffdb1e8494fc0b9b1279f6027b096a819f844406 Mon Sep 17 00:00:00 2001 From: Juan Camacho Mohedano Date: Thu, 13 Jul 2023 19:31:56 +0200 Subject: [PATCH 22/22] trust types in model file --- skops/hub_utils/_hf_hub.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py index e0000112..9f120471 100644 --- a/skops/hub_utils/_hf_hub.py +++ b/skops/hub_utils/_hf_hub.py @@ -437,7 +437,7 @@ def init( ) # load model from file - model = _load_model(model) + model = _load_model(model, trusted=True) model_card = card.Card(model, metadata=card.metadata_from_config(dst)) model_card.save(dst / "README.md") elif isinstance(model, BaseEstimator):