diff --git a/docs/modules/classes.rst b/docs/modules/classes.rst
index 840a9e61..89d1c21b 100644
--- a/docs/modules/classes.rst
+++ b/docs/modules/classes.rst
@@ -9,5 +9,5 @@ This is the class and function reference of skops.
 :mod:`skops.hf_hub`: HuggingFace Hub Integration
 ================================================
 
-.. automodule:: skops.hf_hub
+.. automodule:: skops.hub_utils
    :members:
diff --git a/examples/plot_hf_hub.py b/examples/plot_hf_hub.py
index 819e539d..fef5507e 100644
--- a/examples/plot_hf_hub.py
+++ b/examples/plot_hf_hub.py
@@ -76,7 +76,11 @@
 
 local_repo = mkdtemp(prefix="skops-")
 hub_utils.init(
-    model=pkl_name, requirements=[f"scikit-learn={sklearn.__version__}"], dst=local_repo
+    model=pkl_name,
+    requirements=[f"scikit-learn={sklearn.__version__}"],
+    dst=local_repo,
+    task="tabular-classification",
+    data=X_test,
 )
 
 # %%
diff --git a/examples/plot_model_card.py b/examples/plot_model_card.py
index 1d504f81..36e8f8a6 100644
--- a/examples/plot_model_card.py
+++ b/examples/plot_model_card.py
@@ -69,7 +69,11 @@
 
 local_repo = mkdtemp(prefix="skops-")
 hub_utils.init(
-    model=pkl_name, requirements=[f"scikit-learn={sklearn.__version__}"], dst=local_repo
+    model=pkl_name,
+    requirements=[f"scikit-learn={sklearn.__version__}"],
+    dst=local_repo,
+    task="tabular-classification",
+    data=X_test,
 )
 
 # %%
diff --git a/skops/_min_dependencies.py b/skops/_min_dependencies.py
index 1c93db3a..2fba8bcc 100644
--- a/skops/_min_dependencies.py
+++ b/skops/_min_dependencies.py
@@ -25,6 +25,7 @@
     "sphinx-prompt": ("1.3.0", "docs", None),
     "matplotlib": ("3.3", "docs", None),
     "pandas": ("1", "docs", None),
+    "typing_extensions": ("3.7", "install", "python_full_version < '3.8'"),
 }
 
 
diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py
index 7c612560..902ba54f 100644
--- a/skops/hub_utils/_hf_hub.py
+++ b/skops/hub_utils/_hf_hub.py
@@ -8,11 +8,21 @@
 import json
 import shutil
 from pathlib import Path
-from typing import Any, MutableMapping, Union
+from typing import Any, List, MutableMapping, Union
 
+import numpy as np
 from huggingface_hub import HfApi, snapshot_download
 from requests import HTTPError
 
+from ..utils.fixes import Literal
+
+SUPPORTED_TASKS = [
+    "tabular-classification",
+    "tabular-regression",
+    "text-classification",
+    "text-regression",
+]
+
 
 def _validate_folder(path: Union[str, Path]) -> None:
     """Validate the contents of a folder.
@@ -57,8 +67,85 @@ def _validate_folder(path: Union[str, Path]) -> None:
         raise TypeError(f"Model file {model_path} does not exist.")
 
 
+def _get_example_input(data):
+    """Returns the example input of a model.
+
+    The input is converted into a dictionary which is then stored in the config
+    file.
+
+    Parameters
+    ----------
+    data: array-like
+        The input needs to be either a ``pandas.DataFrame`` or a
+        ``numpy.ndarray``. The first 3 rows are used as example input.
+
+    Returns
+    -------
+    example_input: dict of lists
+        The example input of the model as accepted by HuggingFace's backend.
+ """ + try: + import pandas as pd + + if isinstance(data, pd.DataFrame): + return {x: data[x][:3].to_list() for x in data.columns} + except ImportError: + # pandas is not installed, the data cannot be a pandas DataFrame + pass + + # here we convert the first three rows of the numpy array to a dict of lists + # to be stored in the config file + if isinstance(data, np.ndarray): + return {f"x{x}": data[:3, x].tolist() for x in range(data.shape[1])} + + raise ValueError("The data is not a pandas.DataFrame or a numpy.ndarray.") + + +def _get_column_names(data): + """Returns the column names of the input. + + If data is a ``numpy.ndarray``, column names are assumed to be ``x0`` to + ``xn-1``, where ``n`` is the number of columns. + + Parameters + ---------- + data: pandas.DataFrame or numpy.ndarray + The data whose columns names are to be returned. + + Returns + ------- + columns: list of tuples + A list of strings. Each string is a column name. + """ + try: + import pandas as pd + + if isinstance(data, pd.DataFrame): + return list(data.columns) + except ImportError: + # pandas is not installed, the data cannot be a pandas DataFrame + pass + + # TODO: this is going to fail for Structured Arrays. We can add support for + # them later if we see need for it. + if isinstance(data, np.ndarray): + return [f"x{x}" for x in range(data.shape[1])] + + raise ValueError("The data is not a pandas.DataFrame or a numpy.ndarray.") + + def _create_config( - *, model_path: Union[str, Path], requirements: list[str], dst: Union[str, Path] + *, + model_path: Union[str, Path], + requirements: List[str], + dst: Union[str, Path], + task: Literal[ + "tabular-classification", + "tabular-regression", + "text-classification", + "text-regression", + ], + data, ) -> None: """Write the configuration into a `config.json` file. @@ -74,6 +161,25 @@ def _create_config( dst : str, or Path The path to an existing folder where the config file should be created. + task: "tabular-classification", "tabular-regression", + "text-classification", / + or "text-regression" + The task of the model, which determines the input and output type of + the model. It can be one of: ``tabular-classification``, + ``tabular-regression``, ``text-classification``, ``text-regression``. + + data: array-like + The input to the model. This is used for two purposes: + + 1. Save an example input to the model, which is used by + HuggingFace's backend and shown in the widget of the model's + page. + 2. Store the columns and their order of the input, which is used by + HuggingFace's backend to pass the data in the right form to the + model. + + The first 3 input values are used as example inputs. 
+
     Returns
     -------
     None
@@ -87,13 +193,33 @@ def recursively_default_dict() -> MutableMapping:
     config = recursively_default_dict()
     config["sklearn"]["model"]["file"] = str(model_path)
     config["sklearn"]["environment"] = requirements
+    config["sklearn"]["task"] = task
+
+    if "tabular" in task:
+        config["sklearn"]["example_input"] = _get_example_input(data)
+        config["sklearn"]["columns"] = _get_column_names(data)
+    elif "text" in task:
+        if isinstance(data, list) and all(isinstance(x, str) for x in data):
+            config["sklearn"]["example_input"] = {"data": data[:3]}
+        else:
+            raise ValueError("The data needs to be a list of strings.")
 
     with open(Path(dst) / "config.json", mode="w") as f:
         json.dump(config, f, sort_keys=True, indent=4)
 
 
 def init(
-    *, model: Union[str, Path], requirements: list[str], dst: Union[str, Path]
+    *,
+    model: Union[str, Path],
+    requirements: List[str],
+    dst: Union[str, Path],
+    task: Literal[
+        "tabular-classification",
+        "tabular-regression",
+        "text-classification",
+        "text-regression",
+    ],
+    data,
 ) -> None:
     """Initialize a scikit-learn based HuggingFace repo.
 
@@ -112,6 +238,28 @@
     dst: str, or Path
         The path to a non-existing or empty folder which is to be initialized.
 
+    task: str
+        The task of the model, which determines the input and output type of
+        the model. It can be one of: ``tabular-classification``,
+        ``tabular-regression``, ``text-classification``, ``text-regression``.
+
+    data: array-like
+        The input to the model. This is used for two purposes:
+
+        1. Save an example input to the model, which is used by
+           HuggingFace's backend and shown in the widget of the model's
+           page.
+        2. Store the columns and their order of the input, which is used by
+           HuggingFace's backend to pass the data in the right form to the
+           model.
+
+        The first 3 input values are used as example inputs.
+
+        If ``task`` is ``tabular-classification`` or ``tabular-regression``,
+        the data needs to be a ``pandas.DataFrame`` or a ``numpy.ndarray``. If
+        ``task`` is ``text-classification`` or ``text-regression``, the data
+        needs to be a ``list`` of strings.
+
     Returns
     -------
     None
@@ -119,12 +267,23 @@
     dst = Path(dst)
     if dst.exists() and bool(next(dst.iterdir(), None)):
         raise OSError("None-empty dst path already exists!")
+
+    if task not in SUPPORTED_TASKS:
+        raise ValueError(
+            f"Task {task} not supported. Supported tasks are: {SUPPORTED_TASKS}"
+        )
     dst.mkdir(parents=True, exist_ok=True)
 
     shutil.copy2(src=model, dst=dst)
 
     model_name = Path(model).name
-    _create_config(model_path=model_name, requirements=requirements, dst=dst)
+    _create_config(
+        model_path=model_name,
+        requirements=requirements,
+        dst=dst,
+        task=task,
+        data=data,
+    )
 
 
 def update_env(
@@ -222,7 +381,7 @@ def get_config(path: Union[str, Path]) -> dict[str, Any]:
     Parameters
     ----------
     path: str
-        The path to the director holding the project and its ``config.json``
+        The path to the directory holding the project and its ``config.json``
         configuration file.
 
     Returns
diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py
index c0542a0b..d7736752 100644
--- a/skops/hub_utils/tests/test_hf_hub.py
+++ b/skops/hub_utils/tests/test_hf_hub.py
@@ -6,16 +6,32 @@
 from pathlib import Path
 from uuid import uuid4
 
+import numpy as np
+import pandas as pd
 import pytest
 from flaky import flaky
 from huggingface_hub import HfApi
+from sklearn.datasets import load_iris
 from sklearn.linear_model import LogisticRegression
 
 from skops.hub_utils import download, get_config, get_requirements, init, push
-from skops.hub_utils._hf_hub import _create_config, _validate_folder
+from skops.hub_utils._hf_hub import (
+    _create_config,
+    _get_column_names,
+    _get_example_input,
+    _validate_folder,
+)
 from skops.hub_utils.tests.common import HF_HUB_TOKEN
 from skops.utils.fixes import metadata, path_unlink
 
+iris = load_iris(as_frame=True, return_X_y=False)
+
+
+@pytest.fixture
+def temp_path():
+    with tempfile.TemporaryDirectory(prefix="skops-test-temp-path") as temp_path:
+        yield temp_path
+
 
 @pytest.fixture(scope="session")
 def repo_path():
@@ -30,10 +46,11 @@ def destination_path():
 
 
 @pytest.fixture(scope="session")
-def model_pickle(repo_path):
+def classifier_pickle(repo_path):
     # Create a simple pickle file for the purpose of testing
-    clf = LogisticRegression()
-    clf.fit([[0, 1], [1, 0]], [0, 1])
+    X, y = iris.data, iris.target
+    clf = LogisticRegression(solver="newton-cg")
+    clf.fit(X, y)
     path = repo_path / "model.pickle"
 
     try:
@@ -90,45 +107,118 @@ def test_validate_folder(config_json):
         _validate_folder(path=dir_path)
 
 
-def test_create_config():
+@pytest.mark.parametrize(
+    "data, task, expected_config",
+    [
+        (
+            iris.data,
+            "tabular-classification",
+            {
+                "sklearn": {
+                    "columns": [
+                        "petal length (cm)",
+                        "petal width (cm)",
+                        "sepal length (cm)",
+                        "sepal width (cm)",
+                    ],
+                    "environment": ['scikit-learn="1.1.1"', "numpy"],
+                    "example_input": {
+                        "petal length (cm)": [1.4, 1.4, 1.3],
+                        "petal width (cm)": [0.2, 0.2, 0.2],
+                        "sepal length (cm)": [5.1, 4.9, 4.7],
+                        "sepal width (cm)": [3.5, 3.0, 3.2],
+                    },
+                    "model": {"file": "model.pkl"},
+                    "task": "tabular-classification",
+                }
+            },
+        ),
+        (
+            ["test", "text", "problem", "random"],
+            "text-classification",
+            {
+                "sklearn": {
+                    "environment": ['scikit-learn="1.1.1"', "numpy"],
+                    "example_input": {"data": ["test", "text", "problem"]},
+                    "model": {"file": "model.pkl"},
+                    "task": "text-classification",
+                }
+            },
+        ),
+    ],
+)
+def test_create_config(data, task, expected_config):
     dir_path = tempfile.mkdtemp()
     _create_config(
         model_path="model.pkl",
         requirements=['scikit-learn="1.1.1"', "numpy"],
         dst=dir_path,
+        task=task,
+        data=data,
     )
 
-    config_content = {
-        "sklearn": {
-            "environment": ['scikit-learn="1.1.1"', "numpy"],
-            "model": {"file": "model.pkl"},
-        }
-    }
-
     with open(Path(dir_path) / "config.json") as f:
         config = json.load(f)
-        assert config == config_content
+        for key in ["environment", "model", "task"]:
+            assert config["sklearn"][key] == expected_config["sklearn"][key]
+
+        keys = ["example_input"]
+        if "tabular" in task:
+            # text data doesn't introduce any "columns" in the configuration
+            keys += ["columns"]
+        for key in keys:
+            assert sorted(config["sklearn"][key]) == sorted(
+                expected_config["sklearn"][key]
+            )
+
+
+def test_create_config_invalid_text_data(temp_path):
+    with pytest.raises(ValueError, match="The data needs to be a list of strings."):
+        _create_config(
+            model_path="model.pkl",
+            requirements=['scikit-learn="1.1.1"', "numpy"],
+            task="text-classification",
+            data=[1, 2, 3],
+            dst=temp_path,
+        )
+
+
+def test_init_invalid_task(classifier_pickle, temp_path):
+    with pytest.raises(
+        ValueError, match="Task invalid not supported. Supported tasks are"
+    ):
+        init(
+            model=classifier_pickle,
+            requirements=["scikit-learn"],
+            dst=temp_path,
+            task="invalid",
+            data=iris.data,
+        )
 
 
-def test_init(model_pickle, config_json):
+def test_init(classifier_pickle, config_json):
     # create a temp directory and delete it, we just need a unique name.
     dir_path = tempfile.mkdtemp()
     shutil.rmtree(dir_path)
 
     version = metadata.version("scikit-learn")
     init(
-        model=model_pickle,
+        model=classifier_pickle,
         requirements=[f'scikit-learn="{version}"'],
         dst=dir_path,
+        task="tabular-classification",
+        data=iris.data,
     )
     _validate_folder(path=dir_path)
 
     # it should fail a second time since the folder is no longer empty.
     with pytest.raises(OSError, match="None-empty dst path already exists!"):
         init(
-            model=model_pickle,
+            model=classifier_pickle,
             requirements=[f'scikit-learn="{version}"'],
             dst=dir_path,
+            task="tabular-classification",
+            data=iris.data,
         )
 
 
@@ -136,15 +226,21 @@
 @flaky(max_runs=3)
 @pytest.mark.parametrize("explicit_create", [True, False])
 def test_push_download(
-    explicit_create, repo_path, destination_path, model_pickle, config_json
+    explicit_create,
+    repo_path,
+    destination_path,
+    classifier_pickle,
+    config_json,
 ):
     client = HfApi()
 
     version = metadata.version("scikit-learn")
     init(
-        model=model_pickle,
+        model=classifier_pickle,
         requirements=[f'scikit-learn="{version}"'],
         dst=destination_path,
+        task="tabular-classification",
+        data=iris.data,
     )
 
     user = client.whoami(token=HF_HUB_TOKEN)["name"]
@@ -163,7 +259,7 @@
     download(repo_id=repo_id, dst=destination_path)
 
     files = client.list_repo_files(repo_id=repo_id, token=HF_HUB_TOKEN)
-    for f_name in [model_pickle.name, config_json.name]:
+    for f_name in [classifier_pickle.name, config_json.name]:
         assert f_name in files
 
     try:
@@ -179,3 +275,38 @@ def test_get_config(repo_path):
     config = get_config(repo_path)
     assert config == CONFIG
     assert get_requirements(repo_path) == ['scikit-learn="1.1.1"']
+
+
+def test_get_example_input():
+    """Test the _get_example_input function."""
+    with pytest.raises(
+        ValueError, match="The data is not a pandas.DataFrame or a numpy.ndarray."
+    ):
+        _get_example_input(["a", "b", "c"])
+
+    examples = _get_example_input(np.ones((5, 10)))
+    # the result is a dictionary of column name: list of values
+    assert len(examples) == 10
+    assert len(examples["x0"]) == 3
+
+    examples = _get_example_input(
+        pd.DataFrame(np.ones((5, 10)), columns=[f"column{x}" for x in range(10)])
+    )
+    # the result is a dictionary of column name: list of values
+    assert len(examples) == 10
+    assert len(examples["column0"]) == 3
+
+
+def test_get_column_names():
+    with pytest.raises(
+        ValueError, match="The data is not a pandas.DataFrame or a numpy.ndarray."
+ ): + _get_column_names(["a", "b", "c"]) + + X_array = np.ones((5, 10), dtype=np.float32) + expected_columns = [f"x{x}" for x in range(10)] + assert _get_column_names(X_array) == expected_columns + + expected_columns = [f"column{x}" for x in range(10)] + X_df = pd.DataFrame(X_array, columns=expected_columns) + assert _get_column_names(X_df) == expected_columns diff --git a/skops/utils/fixes.py b/skops/utils/fixes.py index b6c4963c..e9d83558 100644 --- a/skops/utils/fixes.py +++ b/skops/utils/fixes.py @@ -12,6 +12,15 @@ # older pythons import importlib_metadata as metadata # noqa +if sys.version_info >= (3, 8): + # py>=3.8 + from typing import Literal # noqa +else: + # older pythons, this requires typing_extensions to be installed. + # if you're removing this, you should also remove the dependency from + # _min_dependencies.py + from typing_extensions import Literal # noqa + def path_unlink(path: Path, missing_ok: bool = False) -> None: """Remove this file or symbolic link