Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/modules/classes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ This is the class and function reference of skops.
:mod:`skops.hf_hub`: HuggingFace Hub Integration
================================================

.. automodule:: skops.hf_hub
.. automodule:: skops.hub_utils
:members:
6 changes: 5 additions & 1 deletion examples/plot_hf_hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,11 @@

local_repo = mkdtemp(prefix="skops-")
hub_utils.init(
model=pkl_name, requirements=[f"scikit-learn={sklearn.__version__}"], dst=local_repo
model=pkl_name,
requirements=[f"scikit-learn={sklearn.__version__}"],
dst=local_repo,
task="tabular-classification",
Comment thread
adrinjalali marked this conversation as resolved.
data=X_test,
)

# %%
Expand Down
6 changes: 5 additions & 1 deletion examples/plot_model_card.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,11 @@
local_repo = mkdtemp(prefix="skops-")

hub_utils.init(
model=pkl_name, requirements=[f"scikit-learn={sklearn.__version__}"], dst=local_repo
model=pkl_name,
requirements=[f"scikit-learn={sklearn.__version__}"],
dst=local_repo,
task="tabular-classification",
data=X_test,
)

# %%
Expand Down
1 change: 1 addition & 0 deletions skops/_min_dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"sphinx-prompt": ("1.3.0", "docs", None),
"matplotlib": ("3.3", "docs", None),
"pandas": ("1", "docs", None),
"typing_extensions": ("3.7", "install", "python_full_version < '3.8'"),
}


Expand Down
169 changes: 164 additions & 5 deletions skops/hub_utils/_hf_hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,21 @@
import json
import shutil
from pathlib import Path
from typing import Any, MutableMapping, Union
from typing import Any, List, MutableMapping, Union

import numpy as np
from huggingface_hub import HfApi, snapshot_download
from requests import HTTPError

from ..utils.fixes import Literal

# Tasks supported by the Hub integration; ``init`` validates its ``task``
# argument against this list.
SUPPORTED_TASKS = [
    "tabular-classification",
    "tabular-regression",
    "text-classification",
    "text-regression",
]


def _validate_folder(path: Union[str, Path]) -> None:
"""Validate the contents of a folder.
Expand Down Expand Up @@ -57,8 +67,85 @@ def _validate_folder(path: Union[str, Path]) -> None:
raise TypeError(f"Model file {model_path} does not exist.")


def _get_example_input(data):
"""Returns the example input of a model.

The input is converted into a dictionary which is then stored in the config
file.

Parameters
----------
data: array-like
The input needs to be either a ``pandas.DataFrame`` or a
``numpy.ndarray``. The first 3 rows are used as example input.
Comment thread
adrinjalali marked this conversation as resolved.

Returns
-------
example_input: dict of lists
The example input of the model as accepted by HuggingFace's backend.
"""
try:
import pandas as pd

if isinstance(data, pd.DataFrame):
return {x: data[x][:3].to_list() for x in data.columns}
except ImportError:
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See #53 for discussion on pandas dependency.

# pandas is not installed, the data cannot be a pandas DataFrame
pass

# here we convert the first three rows of the numpy array to a dict of lists
# to be stored in the config file
if isinstance(data, np.ndarray):
return {f"x{x}": data[:3, x].tolist() for x in range(data.shape[1])}

raise ValueError("The data is not a pandas.DataFrame or a numpy.ndarray.")


def _get_column_names(data):
"""Returns the column names of the input.

If data is a ``numpy.ndarray``, column names are assumed to be ``x0`` to
``xn-1``, where ``n`` is the number of columns.

Parameters
----------
data: pandas.DataFrame or numpy.ndarray
The data whose columns names are to be returned.

Returns
-------
columns: list of tuples
A list of strings. Each string is a column name.
"""
try:
import pandas as pd

if isinstance(data, pd.DataFrame):
return list(data.columns)
except ImportError:
# pandas is not installed, the data cannot be a pandas DataFrame
pass

# TODO: this is going to fail for Structured Arrays. We can add support for
# them later if we see need for it.
if isinstance(data, np.ndarray):
return [f"x{x}" for x in range(data.shape[1])]

raise ValueError("The data is not a pandas.DataFrame or a numpy.ndarray.")


def _create_config(
*, model_path: Union[str, Path], requirements: list[str], dst: Union[str, Path]
*,
model_path: Union[str, Path],
requirements: List[str],
dst: Union[str, Path],
task: Literal[
"tabular-classification",
"tabular-regression",
"text-classification",
"text-regression",
],
data,
) -> None:
"""Write the configuration into a `config.json` file.

Expand All @@ -74,6 +161,25 @@ def _create_config(
dst : str, or Path
The path to an existing folder where the config file should be created.

task: "tabular-classification", "tabular-regression",
"text-classification", /
or "text-regression"
The task of the model, which determines the input and output type of
the model. It can be one of: ``tabular-classification``,
``tabular-regression``, ``text-classification``, ``text-regression``.

data: array-like
The input to the model. This is used for two purposes:

1. Save an example input to the model, which is used by
HuggingFace's backend and shown in the widget of the model's
page.
2. Store the columns and their order of the input, which is used by
HuggingFace's backend to pass the data in the right form to the
model.

The first 3 input values are used as example inputs.

Returns
-------
None
Expand All @@ -87,13 +193,33 @@ def recursively_default_dict() -> MutableMapping:
config = recursively_default_dict()
config["sklearn"]["model"]["file"] = str(model_path)
config["sklearn"]["environment"] = requirements
config["sklearn"]["task"] = task

if "tabular" in task:
config["sklearn"]["example_input"] = _get_example_input(data)
config["sklearn"]["columns"] = _get_column_names(data)
elif "text" in task:
if isinstance(data, list) and all(isinstance(x, str) for x in data):
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to CountVectorizer et al, the input data needs to be an "iterable", and the docstring here says "array-like". Therefore, a user may pass, for instance, a numpy array and should work fine. Do we still want to constrain the type to be a list here?

Say we don't require lists, the code below assumes that data is Sequence (can be index). If we don't want to make that assumption, we could go for list(itertools.islice(data, 3))

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes I know we're being very restrictive with the types we accept here. We also don't accept all array-like things accepted by sklearn. Adding support for those is nice, and necessary, but I was hoping to have that in a future PR and have something basic here, which doesn't necessarily limit users. They can still pass a list to this method even if their data is originally in a Series form.

Opened #65 to track this.

config["sklearn"]["example_input"] = {"data": data[:3]}
else:
raise ValueError("The data needs to be a list of strings.")

with open(Path(dst) / "config.json", mode="w") as f:
json.dump(config, f, sort_keys=True, indent=4)


def init(
*, model: Union[str, Path], requirements: list[str], dst: Union[str, Path]
*,
model: Union[str, Path],
requirements: List[str],
dst: Union[str, Path],
task: Literal[
"tabular-classification",
"tabular-regression",
"text-classification",
"text-regression",
],
data,
Comment thread
adrinjalali marked this conversation as resolved.
) -> None:
"""Initialize a scikit-learn based HuggingFace repo.

Expand All @@ -112,19 +238,52 @@ def init(
dst: str, or Path
The path to a non-existing or empty folder which is to be initialized.

task: str
The task of the model, which determines the input and output type of
the model. It can be one of: ``tabular-classification``,
``tabular-regression``, ``text-classification``, ``text-regression``.

data: array-like
The input to the model. This is used for two purposes:

1. Save an example input to the model, which is used by
HuggingFace's backend and shown in the widget of the model's
page.
2. Store the columns and their order of the input, which is used by
HuggingFace's backend to pass the data in the right form to the
model.

The first 3 input values are used as example inputs.

If ``task`` is ``tabular-classification`` or ``tabular-regression``,
the data needs to be a ``pandas.DataFrame`` or a ``numpy.ndarray``. If
``task`` is ``text-classification`` or ``text-regression``, the data
needs to be a ``list`` of strings.

Returns
-------
None
"""
dst = Path(dst)
if dst.exists() and bool(next(dst.iterdir(), None)):
raise OSError("None-empty dst path already exists!")

if task not in SUPPORTED_TASKS:
raise ValueError(
f"Task {task} not supported. Supported tasks are: {SUPPORTED_TASKS}"
)
dst.mkdir(parents=True, exist_ok=True)

shutil.copy2(src=model, dst=dst)

model_name = Path(model).name
_create_config(model_path=model_name, requirements=requirements, dst=dst)
_create_config(
model_path=model_name,
requirements=requirements,
dst=dst,
task=task,
data=data,
)


def update_env(
Expand Down Expand Up @@ -222,7 +381,7 @@ def get_config(path: Union[str, Path]) -> dict[str, Any]:
Parameters
----------
path: str
The path to the director holding the project and its ``config.json``
The path to the directory holding the project and its ``config.json``
Comment thread
adrinjalali marked this conversation as resolved.
configuration file.

Returns
Expand Down
Loading