Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/modules/classes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ This is the class and function reference of skops.
:mod:`skops.hf_hub`: HuggingFace Hub Integration
================================================

.. automodule:: skops.hf_hub
.. automodule:: skops.hub_utils
:members:
6 changes: 5 additions & 1 deletion examples/plot_hf_hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,11 @@

local_repo = mkdtemp(prefix="skops-")
hub_utils.init(
model=pkl_name, requirements=[f"scikit-learn={sklearn.__version__}"], dst=local_repo
model=pkl_name,
requirements=[f"scikit-learn={sklearn.__version__}"],
dst=local_repo,
task="tabular-classification",
Comment thread
adrinjalali marked this conversation as resolved.
data=X_test,
)

# %%
Expand Down
6 changes: 5 additions & 1 deletion examples/plot_model_card.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,11 @@
local_repo = mkdtemp(prefix="skops-")

hub_utils.init(
model=pkl_name, requirements=[f"scikit-learn={sklearn.__version__}"], dst=local_repo
model=pkl_name,
requirements=[f"scikit-learn={sklearn.__version__}"],
dst=local_repo,
task="tabular-classification",
data=X_test,
)

# %%
Expand Down
1 change: 1 addition & 0 deletions skops/_min_dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"sphinx-prompt": ("1.3.0", "docs", None),
"matplotlib": ("3.3", "docs", None),
"pandas": ("1", "docs", None),
"typing_extensions": ("3.7", "install", "python_full_version < '3.8'"),
}


Expand Down
169 changes: 164 additions & 5 deletions skops/hub_utils/_hf_hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,21 @@
import json
import shutil
from pathlib import Path
from typing import Any, MutableMapping, Union
from typing import Any, List, MutableMapping, Union

import numpy as np
from huggingface_hub import HfApi, snapshot_download
from requests import HTTPError

from ..utils.fixes import Literal

# Tasks supported by the Hub integration; ``init`` validates its ``task``
# argument against this list.
SUPPORTED_TASKS = [
    "tabular-classification",
    "tabular-regression",
    "text-classification",
    "text-regression",
]


def _validate_folder(path: Union[str, Path]) -> None:
"""Validate the contents of a folder.
Expand Down Expand Up @@ -57,8 +67,85 @@ def _validate_folder(path: Union[str, Path]) -> None:
raise TypeError(f"Model file {model_path} does not exist.")


def _get_example_input(data):
"""Returns the example input of a model.

The input is converted into a dictionary which is then stored in the config
file.

Parameters
----------
data: array-like
The input needs to be either a ``pandas.DataFrame`` or a
``numpy.ndarray``. The first 3 rows are used as example input.
Comment thread
adrinjalali marked this conversation as resolved.

Returns
-------
example_input: dict of lists
The example input of the model as accepted by HuggingFace's backend.
"""
try:
import pandas as pd

if isinstance(data, pd.DataFrame):
return {x: data[x][:3].to_list() for x in data.columns}
except ImportError:
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See #53 for discussion on pandas dependency.

# pandas is not installed, the data cannot be a pandas DataFrame
pass

# here we convert the first three rows of the numpy array to a dict of lists
# to be stored in the config file
if isinstance(data, np.ndarray):
return {f"x{x}": data[:3, x].tolist() for x in range(data.shape[1])}

raise ValueError("The data is not a pandas.DataFrame or a numpy.ndarray.")


def _get_column_names(data):
"""Returns the column names of the input.

If data is a ``numpy.ndarray``, column names are assumed to be ``x0`` to
``xn-1``, where ``n`` is the number of columns.

Parameters
----------
data: pandas.DataFrame or numpy.ndarray
The data whose columns names are to be returned.

Returns
-------
columns: list of tuples
A list of strings. Each string is a column name.
"""
try:
import pandas as pd

if isinstance(data, pd.DataFrame):
return list(data.columns)
except ImportError:
# pandas is not installed, the data cannot be a pandas DataFrame
pass

# TODO: this is going to fail for Structured Arrays. We can add support for
# them later if we see need for it.
if isinstance(data, np.ndarray):
return [f"x{x}" for x in range(data.shape[1])]

raise ValueError("The data is not a pandas.DataFrame or a numpy.ndarray.")


def _create_config(
*, model_path: Union[str, Path], requirements: list[str], dst: Union[str, Path]
*,
model_path: Union[str, Path],
requirements: List[str],
dst: Union[str, Path],
task: Literal[
"tabular-classification",
"tabular-regression",
"text-classification",
"text-regression",
],
data,
) -> None:
"""Write the configuration into a `config.json` file.

Expand All @@ -74,6 +161,25 @@ def _create_config(
dst : str, or Path
The path to an existing folder where the config file should be created.

task: "tabular-classification", "tabular-regression",
"text-classification", /
or "text-regression"
The task of the model, which determines the input and output type of
the model. It can be one of: ``tabular-classification``,
``tabular-regression``, ``text-classification``, ``text-regression``.

data: array-like
The input to the model. This is used for two purposes:

1. Save an example input to the model, which is used by
HuggingFace's backend and shown in the widget of the model's
page.
2. Store the columns and their order of the input, which is used by
HuggingFace's backend to pass the data in the right form to the
model.

The first 3 input values are used as example inputs.

Returns
-------
None
Expand All @@ -87,13 +193,33 @@ def recursively_default_dict() -> MutableMapping:
config = recursively_default_dict()
config["sklearn"]["model"]["file"] = str(model_path)
config["sklearn"]["environment"] = requirements
config["sklearn"]["task"] = task

if "tabular" in task:
config["sklearn"]["example_input"] = _get_example_input(data)
config["sklearn"]["columns"] = _get_column_names(data)
elif "text" in task:
if isinstance(data, list) and all(isinstance(x, str) for x in data):
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to CountVectorizer et al, the input data needs to be an "iterable", and the docstring here says "array-like". Therefore, a user may pass, for instance, a numpy array and should work fine. Do we still want to constrain the type to be a list here?

Say we don't require lists, the code below assumes that data is Sequence (can be index). If we don't want to make that assumption, we could go for list(itertools.islice(data, 3))

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes I know we're being very restrictive with the types we accept here. We also don't accept all array-like things accepted by sklearn. Adding support for those is nice, and necessary, but I was hoping to have that in a future PR and have something basic here, which doesn't necessarily limit users. They can still pass a list to this method even if their data is originally in a Series form.

Opened #65 to track this.

config["sklearn"]["example_input"] = {"data": data[:3]}
else:
raise ValueError("The data needs to be a list of strings.")

with open(Path(dst) / "config.json", mode="w") as f:
json.dump(config, f, sort_keys=True, indent=4)


def init(
*, model: Union[str, Path], requirements: list[str], dst: Union[str, Path]
*,
model: Union[str, Path],
requirements: List[str],
dst: Union[str, Path],
task: Literal[
"tabular-classification",
"tabular-regression",
"text-classification",
"text-regression",
],
data,
Comment thread
adrinjalali marked this conversation as resolved.
) -> None:
"""Initialize a scikit-learn based HuggingFace repo.

Expand All @@ -112,19 +238,52 @@ def init(
dst: str, or Path
The path to a non-existing or empty folder which is to be initialized.

task: str
The task of the model, which determines the input and output type of
the model. It can be one of: ``tabular-classification``,
``tabular-regression``, ``text-classification``, ``text-regression``.

data: array-like
The input to the model. This is used for two purposes:

1. Save an example input to the model, which is used by
HuggingFace's backend and shown in the widget of the model's
page.
2. Store the columns and their order of the input, which is used by
HuggingFace's backend to pass the data in the right form to the
model.

The first 3 input values are used as example inputs.

If ``task`` is ``tabular-classification`` or ``tabular-regression``,
the data needs to be a ``pandas.DataFrame`` or a ``numpy.ndarray``. If
``task`` is ``text-classification`` or ``text-regression``, the data
needs to be a ``list`` of strings.

Returns
-------
None
"""
dst = Path(dst)
if dst.exists() and bool(next(dst.iterdir(), None)):
raise OSError("None-empty dst path already exists!")

if task not in SUPPORTED_TASKS:
raise ValueError(
f"Task {task} not supported. Supported tasks are: {SUPPORTED_TASKS}"
)
dst.mkdir(parents=True, exist_ok=True)

shutil.copy2(src=model, dst=dst)

model_name = Path(model).name
_create_config(model_path=model_name, requirements=requirements, dst=dst)
_create_config(
model_path=model_name,
requirements=requirements,
dst=dst,
task=task,
data=data,
)


def update_env(
Expand Down Expand Up @@ -222,7 +381,7 @@ def get_config(path: Union[str, Path]) -> dict[str, Any]:
Parameters
----------
path: str
The path to the director holding the project and its ``config.json``
The path to the directory holding the project and its ``config.json``
Comment thread
adrinjalali marked this conversation as resolved.
configuration file.

Returns
Expand Down
Loading