From 0bbaf42635528b7ed54bc448d5df803bc3ab7590 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 20 Jul 2022 17:21:26 +0200 Subject: [PATCH 01/16] ENH add examples and dtypes to CardData and config.json --- examples/plot_hf_hub.py | 13 ++- skops/card/_model_card.py | 37 +++++-- skops/hub_utils/_hf_hub.py | 148 ++++++++++++++++++++++++++- skops/hub_utils/tests/test_hf_hub.py | 4 + 4 files changed, 187 insertions(+), 15 deletions(-) diff --git a/examples/plot_hf_hub.py b/examples/plot_hf_hub.py index d1fb4f4c..c79da92d 100644 --- a/examples/plot_hf_hub.py +++ b/examples/plot_hf_hub.py @@ -15,12 +15,12 @@ import json import os import pickle +from pathlib import Path from tempfile import mkdtemp, mkstemp from uuid import uuid4 import sklearn from huggingface_hub import HfApi -from modelcards import CardData from sklearn.datasets import load_breast_cancer from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.experimental import enable_halving_search_cv # noqa @@ -76,7 +76,11 @@ local_repo = mkdtemp(prefix="skops-") hub_utils.init( - model=pkl_name, requirements=[f"scikit-learn={sklearn.__version__}"], dst=local_repo + model=pkl_name, + requirements=[f"scikit-learn={sklearn.__version__}"], + dst=local_repo, + task="tabular-classification", + data=X_test, ) # %% @@ -86,9 +90,8 @@ # %% # Model Card # ========== -card_data = CardData(tags=["tabular-classification"]) -model_card = card.create_model_card(model, card_data) -model_card.save(os.path.join(f"{local_repo}", "README.md")) +model_card = card.create_model_card(local_repo) +model_card.save(Path(local_repo) / "README.md") # %% # Push to Hub diff --git a/skops/card/_model_card.py b/skops/card/_model_card.py index 49b7f1ac..b5e27863 100644 --- a/skops/card/_model_card.py +++ b/skops/card/_model_card.py @@ -1,11 +1,15 @@ import os +import pickle import re +from pathlib import Path -from modelcards import ModelCard +from modelcards import CardData, ModelCard from sklearn.utils import estimator_html_repr import skops +from ..hub_utils import get_config + def _extract_estimator_config(model): """Extracts estimator configuration and renders them into a vertical table. @@ -27,23 +31,44 @@ def _extract_estimator_config(model): def create_model_card( - model, - card_data, + path, + card_data=None, **card_kwargs, ): """Creates a model card for the model and saves it to the target directory. Parameters: ---------- - model: estimator - scikit-learn compatible estimator. - card_data: CardData + path: str + The path to the local directory containing the model and corresponding + configuration file. + card_data: CardData, optional CardData object. card_kwargs: Card kwargs are information you can pass to fill in the sections of the card template, e.g. model_description, citation_bibtex, get_started_code. """ ROOT = skops.__path__ + + # Load the model from the existing directory. + config = get_config(path) + model_path = Path(path) / config["sklearn"]["model"]["file"] + with open(model_path, "rb") as f: + model = pickle.load(f) + + card_data = card_data or CardData() + card_data.tags = card_data.tags or list() + + # Read relevant info from the config file and add them to the CardData + # object. + if "sklearn" not in card_data.tags: + card_data.tags += ["sklearn"] + + if config["sklearn"]["task"] not in card_data.tags: + card_data.tags += [config["sklearn"]["task"]] + + setattr(card_data, "widget", config["sklearn"]["example_input"]) + model_plot = re.sub(r"\n\s+", "", str(estimator_html_repr(model))) hyperparameter_table = _extract_estimator_config(model) card_data.library_name = "sklearn" diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py index 1e214f36..de245362 100644 --- a/skops/hub_utils/_hf_hub.py +++ b/skops/hub_utils/_hf_hub.py @@ -9,9 +9,17 @@ from pathlib import Path from typing import List, Union +import numpy as np from huggingface_hub import HfApi, snapshot_download from requests import HTTPError +SUPPORTED_TASKS = [ + "tabular-classification", + "tabular-regression", + "text-classification", + "text-regression", +] + def _validate_folder(path: Union[str, Path]): """Validate the contents of a folder. @@ -56,7 +64,76 @@ def _validate_folder(path: Union[str, Path]): raise TypeError(f"Model file {model_path} does not exist.") -def _create_config(*, model_path: str, requirements: List[str], dst: str): +def _get_example_input(data): + """Returns the example input of a model. + + The input is converted into a dictionary which is then stored in the config + file. + + Parameters + ---------- + data: array-like + The input needs to be either a ``pandas.DataFrame`` or a + ``numpy.ndarray``. The first 3 rows are used as example input. + + Returns + ------- + example_input: dict of lists + The example input of the model as accepted by HuggingFace's backend. + """ + try: + import pandas as pd + + if isinstance(data, pd.DataFrame): + return {x: data[x][:3].to_list() for x in data.columns} + except ImportError: + # pandas is not installed, the data cannot be a pandas DataFrame + pass + + # here we convert the first three rows of the numpy array to a dict of lists + # to be stored in the config file + if isinstance(data, np.ndarray): + return {x: data[:3, x].tolist() for x in range(data.shape[1])} + + raise ValueError("The data is not a pandas.DataFrame or a numpy.ndarray.") + + +def _get_column_dtypes(data): + """Returns the dtype of the columns of the data. + + If data is a numpy.ndarray, column names are assumed to be ``X0`` to + ``Xn-1``, where ``n`` is the number of columns. + + Parameters + ---------- + data: pandas.DataFrame or numpy.ndarray + The data whose columns along with their dtypes are to be returned. + + Returns + ------- + columns: list of tuples + A list of tuples of the form (column name, dtype). + """ + try: + import pandas as pd + + if isinstance(data, pd.DataFrame): + return list(zip(data.dtypes.index, data.dtypes.astype("str"))) + except ImportError: + # pandas is not installed, the data cannot be a pandas DataFrame + pass + + # TODO: this is going to fail for Structured Arrays. We can add support for + # them later if we see need for it. + if isinstance(data, np.ndarray): + return [(f"X{x}", str(data.dtype)) for x in range(data.shape[1])] + + raise ValueError("The data is not a pandas.DataFrame or a numpy.ndarray.") + + +def _create_config( + *, model_path: str, requirements: List[str], dst: str, task: str, data +): """Write the configuration into a `config.json` file. Parameters @@ -71,6 +148,21 @@ def _create_config(*, model_path: str, requirements: List[str], dst: str): dst : str, or Path The path to an existing folder where the config file should be created. + task: str + The task of the model, which determines the input and output type of + the model. It can be one of: ``tabular-classification``, + ``tabular-regression``, ``text-classification``, ``text-regression``. + + data: array-like + The input to the model. This is used for two purposes: + + 1. Save an example input to the model, which is used by HuggingFace's + backend and shown in the widget of the model's page. + 2. Store the dtype of the input, which is used by HugfingFace's backend + to pass the data with the right dtype to the model. + + The first 3 input values are used as example inputs. + Returns ------- None @@ -84,12 +176,29 @@ def recursively_default_dict(): config = recursively_default_dict() config["sklearn"]["model"]["file"] = model_path config["sklearn"]["environment"] = requirements + config["sklearn"]["task"] = task + + if "tabular" in task: + config["sklearn"]["example_input"] = _get_example_input(data) + config["sklearn"]["columns"] = _get_column_dtypes(data) + elif "text" in task: + if isinstance(data, list): + config["sklearn"]["example_input"] = {"data": data[:3]} + else: + raise ValueError("The data needs to be a list of strings.") with open(Path(dst) / "config.json", mode="w") as f: json.dump(config, f, sort_keys=True, indent=4) -def init(*, model: Union[str, Path], requirements: List[str], dst: Union[str, Path]): +def init( + *, + model: Union[str, Path], + requirements: List[str], + dst: Union[str, Path], + task: str, + data, +): """Initialize a scikit-learn based HuggingFace repo. Given a model pickle and a set of required packages, this function @@ -107,6 +216,26 @@ def init(*, model: Union[str, Path], requirements: List[str], dst: Union[str, Pa dst: str, or Path The path to a non-existing or empty folder which is to be initialized. + task: str + The task of the model, which determines the input and output type of + the model. It can be one of: ``tabular-classification``, + ``tabular-regression``, ``text-classification``, ``text-regression``. + + data: array-like + The input to the model. This is used for two purposes: + + 1. Save an example input to the model, which is used by HuggingFace's + backend and shown in the widget of the model's page. + 2. Store the dtype of the input, which is used by HugfingFace's backend + to pass the data with the right dtype to the model. + + The first 3 input values are used as example inputs. + + If ``task`` is ``tabular-classification`` or ``tabular-regression``, + the data needs to be a ``pandas.DataFrame`` or a ``numpy.ndarray``. If + ``task`` is ``text-classification`` or ``text-regression``, the data + needs to be a ``list`` of strings. + Returns ------- None @@ -114,12 +243,23 @@ def init(*, model: Union[str, Path], requirements: List[str], dst: Union[str, Pa dst = Path(dst) if dst.exists() and next(dst.iterdir(), None): raise OSError("None-empty dst path already exists!") + + if task not in SUPPORTED_TASKS: + raise ValueError( + f"Task {task} not supported. Supported tasks are: {SUPPORTED_TASKS}" + ) dst.mkdir(parents=True, exist_ok=True) shutil.copy2(src=model, dst=dst) model_name = Path(model).name - _create_config(model_path=model_name, requirements=requirements, dst=dst) + _create_config( + model_path=model_name, + requirements=requirements, + dst=dst, + task=task, + data=data, + ) def update_env(*, path: Union[str, Path], requirements: List[str] = None): @@ -215,7 +355,7 @@ def get_config(path: Union[str, Path]): Parameters ---------- path: str - The path to the director holding the project and its ``config.json`` + The path to the directory holding the project and its ``config.json`` configuration file. Returns diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py index 678bbade..a34f82d6 100644 --- a/skops/hub_utils/tests/test_hf_hub.py +++ b/skops/hub_utils/tests/test_hf_hub.py @@ -57,6 +57,7 @@ def test_create_config(): model_path="model.pkl", requirements=['scikit-learn="1.1.1"', "numpy"], dst=dir_path, + data=None, ) config_content = { @@ -81,6 +82,7 @@ def test_init(): model=_get_cwd() / "sample_repo/model.pkl", requirements=[f'scikit-learn="{version}"'], dst=dir_path, + data=None, ) _validate_folder(path=dir_path) @@ -90,6 +92,7 @@ def test_init(): model=_get_cwd() / "sample_repo/model.pkl", requirements=[f'scikit-learn="{version}"'], dst=dir_path, + data=None, ) @@ -102,6 +105,7 @@ def test_push_download(explicit_create): model=_get_cwd() / "sample_repo/model.pkl", requirements=[f'scikit-learn="{version}"'], dst=dir_path, + data=None, ) user = client.whoami(token=HF_HUB_TOKEN)["name"] From 65b191c5df102811966a649b4b0710badf1a0288 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 22 Jul 2022 14:07:57 +0200 Subject: [PATCH 02/16] address some feedback --- skops/hub_utils/_hf_hub.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py index de245362..efc4a101 100644 --- a/skops/hub_utils/_hf_hub.py +++ b/skops/hub_utils/_hf_hub.py @@ -7,7 +7,7 @@ import json import shutil from pathlib import Path -from typing import List, Union +from typing import List, Literal, Union import numpy as np from huggingface_hub import HfApi, snapshot_download @@ -132,7 +132,17 @@ def _get_column_dtypes(data): def _create_config( - *, model_path: str, requirements: List[str], dst: str, task: str, data + *, + model_path: str, + requirements: List[str], + dst: str, + task: Literal[ + "tabular-classification", + "tabular-regression", + "text-classification", + "text-regression", + ], + data, ): """Write the configuration into a `config.json` file. @@ -148,7 +158,9 @@ def _create_config( dst : str, or Path The path to an existing folder where the config file should be created. - task: str + task: "tabular-classification", "tabular-regression", + "text-classification", / + or "text-regression" The task of the model, which determines the input and output type of the model. It can be one of: ``tabular-classification``, ``tabular-regression``, ``text-classification``, ``text-regression``. @@ -156,10 +168,11 @@ def _create_config( data: array-like The input to the model. This is used for two purposes: - 1. Save an example input to the model, which is used by HuggingFace's - backend and shown in the widget of the model's page. - 2. Store the dtype of the input, which is used by HugfingFace's backend - to pass the data with the right dtype to the model. + 1. Save an example input to the model, which is used by + HuggingFace's backend and shown in the widget of the model's + page. + 2. Store the dtype of the input, which is used by HuggingFace's + backend to pass the data with the right dtype to the model. The first 3 input values are used as example inputs. @@ -182,7 +195,7 @@ def recursively_default_dict(): config["sklearn"]["example_input"] = _get_example_input(data) config["sklearn"]["columns"] = _get_column_dtypes(data) elif "text" in task: - if isinstance(data, list): + if isinstance(data, list) and all(isinstance(x, str) for x in data): config["sklearn"]["example_input"] = {"data": data[:3]} else: raise ValueError("The data needs to be a list of strings.") From 475bc43c47a2c386764003bdf8565802e9bad876 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 25 Jul 2022 12:39:09 +0200 Subject: [PATCH 03/16] address more comments --- skops/card/_model_card.py | 2 +- skops/hub_utils/_hf_hub.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/skops/card/_model_card.py b/skops/card/_model_card.py index b5e27863..29a10d28 100644 --- a/skops/card/_model_card.py +++ b/skops/card/_model_card.py @@ -67,7 +67,7 @@ def create_model_card( if config["sklearn"]["task"] not in card_data.tags: card_data.tags += [config["sklearn"]["task"]] - setattr(card_data, "widget", config["sklearn"]["example_input"]) + card_data.widget = config["sklearn"]["example_input"] model_plot = re.sub(r"\n\s+", "", str(estimator_html_repr(model))) hyperparameter_table = _extract_estimator_config(model) diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py index efc4a101..d9cdb73e 100644 --- a/skops/hub_utils/_hf_hub.py +++ b/skops/hub_utils/_hf_hub.py @@ -126,7 +126,7 @@ def _get_column_dtypes(data): # TODO: this is going to fail for Structured Arrays. We can add support for # them later if we see need for it. if isinstance(data, np.ndarray): - return [(f"X{x}", str(data.dtype)) for x in range(data.shape[1])] + return [(f"x{x}", str(data.dtype)) for x in range(data.shape[1])] raise ValueError("The data is not a pandas.DataFrame or a numpy.ndarray.") From d85a7f49ddff4a358d624b6d7b0d0d6384afa40d Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 25 Jul 2022 13:28:24 +0200 Subject: [PATCH 04/16] improve tests --- skops/hub_utils/tests/test_hf_hub.py | 67 +++++++++++++++++++++------- 1 file changed, 51 insertions(+), 16 deletions(-) diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py index e83355b5..a283123b 100644 --- a/skops/hub_utils/tests/test_hf_hub.py +++ b/skops/hub_utils/tests/test_hf_hub.py @@ -8,6 +8,7 @@ import pytest from huggingface_hub import HfApi +from sklearn.datasets import load_iris from sklearn.linear_model import LogisticRegression from skops.hub_utils import download, get_config, get_requirements, init, push @@ -29,10 +30,16 @@ def destination_path(): @pytest.fixture(scope="session") -def model_pickle(repo_path): +def classification_data(): + return load_iris(return_X_y=True, as_frame=True) + + +@pytest.fixture(scope="session") +def classifier_pickle(repo_path, classification_data): # Create a simple pickle file for the purpose of testing - clf = LogisticRegression() - clf.fit([[0, 1], [1, 0]], [0, 1]) + X, y = classification_data + clf = LogisticRegression(solver="newton-cg") + clf.fit(X, y) path = repo_path / "model.pickle" try: @@ -89,63 +96,91 @@ def test_validate_folder(config_json): _validate_folder(path=dir_path) -def test_create_config(): +def test_create_config(classification_data): dir_path = tempfile.mkdtemp() _create_config( model_path="model.pkl", requirements=['scikit-learn="1.1.1"', "numpy"], dst=dir_path, - data=None, + task="tabular-classification", + data=classification_data[0], ) - config_content = { + config_expected = { "sklearn": { + "columns": [ + ["petal length (cm)", "float64"], + ["petal width (cm)", "float64"], + ["sepal length (cm)", "float64"], + ["sepal width (cm)", "float64"], + ], "environment": ['scikit-learn="1.1.1"', "numpy"], + "example_input": { + "petal length (cm)": [1.4, 1.4, 1.3], + "petal width (cm)": [0.2, 0.2, 0.2], + "sepal length (cm)": [5.1, 4.9, 4.7], + "sepal width (cm)": [3.5, 3.0, 3.2], + }, "model": {"file": "model.pkl"}, + "task": "tabular-classification", } } with open(Path(dir_path) / "config.json") as f: config = json.load(f) - assert config == config_content + for key in ["environment", "model", "task"]: + assert config["sklearn"][key] == config_expected["sklearn"][key] + + for key in ["columns", "example_input"]: + assert sorted(config["sklearn"][key]) == sorted( + config_expected["sklearn"][key] + ) -def test_init(model_pickle, config_json): +def test_init(classifier_pickle, classification_data, config_json): # create a temp directory and delete it, we just need a unique name. dir_path = tempfile.mkdtemp() shutil.rmtree(dir_path) version = metadata.version("scikit-learn") init( - model=model_pickle, + model=classifier_pickle, requirements=[f'scikit-learn="{version}"'], dst=dir_path, - data=None, + task="tabular-classification", + data=classification_data[0], ) _validate_folder(path=dir_path) # it should fail a second time since the folder is no longer empty. with pytest.raises(OSError, match="None-empty dst path already exists!"): init( - model=model_pickle, + model=classifier_pickle, requirements=[f'scikit-learn="{version}"'], dst=dir_path, - data=None, + task="tabular-classification", + data=classification_data[0], ) @pytest.mark.parametrize("explicit_create", [True, False]) def test_push_download( - explicit_create, repo_path, destination_path, model_pickle, config_json + explicit_create, + repo_path, + destination_path, + classifier_pickle, + classification_data, + config_json, ): client = HfApi() version = metadata.version("scikit-learn") init( - model=model_pickle, + model=classifier_pickle, requirements=[f'scikit-learn="{version}"'], dst=destination_path, - data=None, + task="tabular-classification", + data=classification_data[0], ) user = client.whoami(token=HF_HUB_TOKEN)["name"] @@ -164,7 +199,7 @@ def test_push_download( download(repo_id=repo_id, dst=destination_path) files = client.list_repo_files(repo_id=repo_id, token=HF_HUB_TOKEN) - for f_name in [model_pickle.name, config_json.name]: + for f_name in [classifier_pickle.name, config_json.name]: assert f_name in files try: From 01c507bddf0cdcaf37ef233f67ac13f657e8031d Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 25 Jul 2022 13:59:09 +0200 Subject: [PATCH 05/16] fix model card tests --- skops/card/tests/test_card.py | 83 ++++++++++++++++++++++------------- 1 file changed, 52 insertions(+), 31 deletions(-) diff --git a/skops/card/tests/test_card.py b/skops/card/tests/test_card.py index a5fb2fe9..24cb775b 100644 --- a/skops/card/tests/test_card.py +++ b/skops/card/tests/test_card.py @@ -1,26 +1,48 @@ -import os +import pickle import tempfile +from pathlib import Path import numpy as np +import pytest +import sklearn from modelcards import CardData from sklearn.linear_model import LinearRegression +from skops import hub_utils from skops.card import create_model_card -def fit_model(): +@pytest.fixture +def temp_path(): + with tempfile.TemporaryDirectory(prefix="skops-test") as dir_path: + yield Path(dir_path) + + +@pytest.fixture +def model_folder(temp_path): X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]]) y = np.dot(X, np.array([1, 2])) + 3 reg = LinearRegression().fit(X, y) - return reg + with tempfile.TemporaryDirectory(prefix="skops-test") as dir_path: + model_path = Path(dir_path) / "model.pkl" + with open(model_path, "wb") as f: + pickle.dump(reg, f) + hub_utils.init( + model=model_path, + requirements=[f"scikit-learn{sklearn.__version__}"], + task="tabular-classification", + data=X, + dst=temp_path, + ) + yield Path(temp_path) -def generate_card(): - model = fit_model() +@pytest.fixture +def sample_card(model_folder): card_data = CardData(library_name="sklearn") model_card = create_model_card( - model, + model_folder, card_data, template_path="skops/card/default_template.md", model_description="sklearn FTW", @@ -28,32 +50,31 @@ def generate_card(): return model_card -def test_write_model_card(): - with tempfile.TemporaryDirectory(prefix="skops-test") as dir_path: - model = fit_model() - card_data = CardData(library_name="sklearn") - model_card = create_model_card( - model, card_data=card_data, model_description="sklearn FTW" - ) - model_card.save(os.path.join(f"{dir_path}", "README.md")) - with open(os.path.join(f"{dir_path}", "README.md"), "r") as f: - model_card = f.read() - assert "sklearn FTW" in model_card +def test_write_model_card(model_folder): + card_data = CardData(library_name="sklearn") + model_card = create_model_card( + model_folder, + card_data=card_data, + model_description="sklearn FTW", + ) + card_path = model_folder / "README.md" + model_card.save(card_path) + with open(card_path, "r") as f: + model_card = f.read() + assert "sklearn FTW" in model_card -def test_hyperparameter_table(): - with tempfile.TemporaryDirectory(prefix="skops-test") as dir_path: - model_card = generate_card() - model_card.save(os.path.join(f"{dir_path}", "README.md")) - with open(os.path.join(f"{dir_path}", "README.md"), "r") as f: - model_card = f.read() - assert "fit_intercept" in model_card +def test_hyperparameter_table(sample_card, temp_path): + card_path = temp_path / "README.md" + sample_card.save(card_path) + with open(card_path, "r") as f: + model_card = f.read() + assert "fit_intercept" in model_card -def test_plot_model(): - with tempfile.TemporaryDirectory(prefix="skops-test") as dir_path: - model_card = generate_card() - model_card.save(os.path.join(f"{dir_path}", "README.md")) - with open(os.path.join(f"{dir_path}", "README.md"), "r") as f: - model_card = f.read() - assert "