Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion docs/changes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ skops Changelog
:depth: 1
:local:

v0.5
----
- Support more array-like data types for tabular data and list-like data types
for text data. :pr:`179` by `Francesco Cariaggi`_.

v0.4
----
- :func:`.io.dump` and :func:`.io.load` now work with file like objects,
Expand Down Expand Up @@ -83,4 +88,4 @@ Contributors

:user:`Adrin Jalali <adrinjalali>`, :user:`Merve Noyan <merveenoyan>`,
:user:`Benjamin Bossan <BenjaminBossan>`, :user:`Ayyuce Demirbas
<ayyucedemirbas>`, :user:`Prajjwal Mishra <p-mishra1>`
<ayyucedemirbas>`, :user:`Prajjwal Mishra <p-mishra1>`, :user:`Francesco Cariaggi <anferico>`
4 changes: 3 additions & 1 deletion skops/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,12 @@
def pandas_not_installed():
# patch import so that it raises an ImportError when trying to import
# pandas. This works because pandas is only imported lazily.
orig_import = __import__

def mock_import(name, *args, **kwargs):
if name == "pandas":
raise ImportError
return __import__(name, *args, **kwargs)
return orig_import(name, *args, **kwargs)

with patch("builtins.__import__", side_effect=mock_import):
yield
105 changes: 81 additions & 24 deletions skops/hub_utils/_hf_hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@
from __future__ import annotations

import collections
import itertools
import json
import os
import shutil
from pathlib import Path
from typing import Any, List, Literal, MutableMapping, Optional, Union
from typing import Any, List, Literal, MutableMapping, Optional, Sequence, Union

import numpy as np
from huggingface_hub import HfApi, InferenceApi, snapshot_download
from sklearn.utils import check_array

SUPPORTED_TASKS = [
"tabular-classification",
Expand Down Expand Up @@ -70,16 +72,17 @@ def _validate_folder(path: Union[str, Path]) -> None:
raise TypeError(f"Model file {model_path} does not exist.")


def _get_example_input(data):
"""Returns the example input of a model.
def _get_example_input_from_tabular_data(data):
"""Returns the example input of a model for a tabular task.

The input is converted into a dictionary which is then stored in the config
file.

Parameters
----------
data: array-like
The input needs to be either a ``pandas.DataFrame`` or a
The input needs to be either a ``pandas.DataFrame``, a 2D
``numpy.ndarray`` or a list/tuple that can be converted to a 2D
``numpy.ndarray``. The first 3 rows are used as example input.

Returns
Expand All @@ -96,28 +99,77 @@ def _get_example_input(data):
# pandas is not installed, the data cannot be a pandas DataFrame
pass

# here we convert the first three rows of the numpy array to a dict of lists
# here we convert the first three rows of `data` to a dict of lists
# to be stored in the config file
if isinstance(data, np.ndarray):
return {f"x{x}": data[:3, x].tolist() for x in range(data.shape[1])}
if isinstance(data, (np.ndarray, list, tuple)):
data_slice = data[:3]
# This will raise a ValueError if the array is not 2D
data_slice_array = check_array(data_slice, ensure_2d=True)
return {
f"x{x}": data_slice_array[:, x].tolist()
for x in range(data_slice_array.shape[1])
}

raise ValueError(
"The data is not a pandas.DataFrame, a 2D numpy.ndarray or a "
"list/tuple that can be converted to a 2D numpy.ndarray."
)


def _get_example_input_from_text_data(data: Sequence[str]):
"""Returns the example input of a model for a text task.

The input is converted into a dictionary which is then stored in the config
file.

Parameters
----------
data: Sequence[str]
A sequence of strings. The first 3 elements are used as example input.

Returns
-------
example_input: dict of lists
The example input of the model as accepted by Hugging Face's backend.
"""

def _head(data, n):
is_data_subscriptable = hasattr(data, "__getitem__")
if is_data_subscriptable:
return data[:n]

return list(itertools.islice(data, n))

raise ValueError("The data is not a pandas.DataFrame or a numpy.ndarray.")
def _is_sequence_of_strings(data):
return not isinstance(data, str) and all(isinstance(x, str) for x in data)

error_message = "The data needs to be a sequence of strings."
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There isn't really a need to assign the error message here. It is better to move it into the except part below.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reason I assigned the error message there is because otherwise I would have to duplicate the same string in the else and except branches, like so:

try:
    data_head = _head(data, n=3)
    if _is_sequence_of_strings(data_head):
        return {"data": data_head}
    else:
        raise ValueError("The data needs to be a sequence of strings.")
except TypeError as e:
    raise ValueError("The data needs to be a sequence of strings.") from e

Another way to avoid this would be to raise a TypeError inside the else branch, although that would feel pretty hacky.
Do you think it's fine to duplicate the error message?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I have missed that it's used at two places.

try:
data_head = _head(data, n=3)
if _is_sequence_of_strings(data_head):
return {"data": data_head}
else:
raise ValueError(error_message)
except TypeError as e:
raise ValueError(error_message) from e


def _get_column_names(data):
"""Returns the column names of the input.

If data is a ``numpy.ndarray``, column names are assumed to be ``x0`` to
``xn-1``, where ``n`` is the number of columns.
If data is not a ``pandas.DataFrame``, column names are assumed to be
``x0`` to ``xn-1``, where ``n`` is the number of columns.

Parameters
----------
data: pandas.DataFrame or numpy.ndarray
The data whose columns names are to be returned.
data: array-like
The data whose columns names are to be returned. Must be a
``pandas.DataFrame``, a 2D ``numpy.ndarray`` or a list/tuple that can
be converted to a 2D ``numpy.ndarray``.

Returns
-------
columns: list of tuples
columns: list of strings
A list of strings. Each string is a column name.
"""
try:
Expand All @@ -131,10 +183,15 @@ def _get_column_names(data):

# TODO: this is going to fail for Structured Arrays. We can add support for
# them later if we see need for it.
if isinstance(data, np.ndarray):
return [f"x{x}" for x in range(data.shape[1])]

raise ValueError("The data is not a pandas.DataFrame or a numpy.ndarray.")
if isinstance(data, (np.ndarray, list, tuple)):
# This will raise a ValueError if the array is not 2D
data_array = check_array(data, ensure_2d=True)
return [f"x{x}" for x in range(data_array.shape[1])]

raise ValueError(
"The data is not a pandas.DataFrame, a 2D numpy.ndarray or a "
"list/tuple that can be converted to a 2D numpy.ndarray."
)


def _create_config(
Expand Down Expand Up @@ -176,7 +233,7 @@ def _create_config(
the model. It can be one of: ``tabular-classification``,
``tabular-regression``, ``text-classification``, ``text-regression``.

data: array-like
data: array-like or sequence
The input to the model. This is used for two purposes:

1. Save an example input to the model, which is used by
Expand All @@ -186,7 +243,10 @@ def _create_config(
HuggingFace's backend to pass the data in the right form to the
model.

The first 3 input values are used as example inputs.
The first 3 input values are used as example inputs. If the task is
``tabular-classification`` or ``tabular-regression``, then data is
expected to be an array-like. Otherwise, it is expected to be an
sequence of strings.

model_format: str
The format used to persist the model. Can be ``"auto"``, ``"skops"``
Expand Down Expand Up @@ -223,13 +283,10 @@ def recursively_default_dict() -> MutableMapping:
config["sklearn"]["model_format"] = model_format

if "tabular" in task:
config["sklearn"]["example_input"] = _get_example_input(data)
config["sklearn"]["example_input"] = _get_example_input_from_tabular_data(data)
config["sklearn"]["columns"] = _get_column_names(data)
elif "text" in task:
if isinstance(data, list) and all(isinstance(x, str) for x in data):
config["sklearn"]["example_input"] = {"data": data[:3]}
else:
raise ValueError("The data needs to be a list of strings.")
config["sklearn"]["example_input"] = _get_example_input_from_text_data(data)

dump_json(Path(dst) / "config.json", config)

Expand Down
65 changes: 52 additions & 13 deletions skops/hub_utils/tests/test_hf_hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@
from skops.hub_utils._hf_hub import (
_create_config,
_get_column_names,
_get_example_input,
_get_example_input_from_tabular_data,
_get_example_input_from_text_data,
_validate_folder,
)
from skops.hub_utils.tests.common import HF_HUB_TOKEN
Expand Down Expand Up @@ -230,7 +231,7 @@ def test_create_config(data, task, expected_config):


def test_create_config_invalid_text_data(temp_path):
with pytest.raises(ValueError, match="The data needs to be a list of strings."):
with pytest.raises(ValueError, match="The data needs to be a sequence of strings."):
_create_config(
model_path="model.pkl",
requirements=['scikit-learn="1.1.1"', "numpy"],
Expand Down Expand Up @@ -521,30 +522,68 @@ def test_update_env(repo_path, config_json):
assert get_requirements(repo_path) == ['scikit-learn="1.1.2"']


def test_get_example_input():
"""Test the _get_example_input function."""
def test_get_example_input_from_tabular_data():
with pytest.raises(
ValueError, match="The data is not a pandas.DataFrame or a numpy.ndarray."
ValueError,
match=(
"The data is not a pandas.DataFrame, a 2D numpy.ndarray or a "
"list/tuple that can be converted to a 2D numpy.ndarray."
),
):
_get_example_input(["a", "b", "c"])
_get_example_input_from_tabular_data("random")
with pytest.raises(ValueError):
_get_example_input_from_tabular_data(["a", "b", "c"])

examples = _get_example_input_from_tabular_data(np.ones((5, 10)))
# the result is a dictionary of column name: list of values
assert len(examples) == 10
assert len(examples["x0"]) == 3

examples = _get_example_input(np.ones((5, 10)))
# the result if a dictionary of column name: list of values
examples = _get_example_input_from_tabular_data(np.ones((5, 10)).tolist())
# the result is a dictionary of column name: list of values
assert len(examples) == 10
assert len(examples["x0"]) == 3

examples = _get_example_input(
examples = _get_example_input_from_tabular_data(
pd.DataFrame(np.ones((5, 10)), columns=[f"column{x}" for x in range(10)])
)
# the result if a dictionary of column name: list of values
# the result is a dictionary of column name: list of values
assert len(examples) == 10
assert len(examples["column0"]) == 3


@pytest.mark.parametrize(
"data, expected_length",
[
(["a", "b", "c", "d"], 3),
(np.array(["a", "b", "c", "d"]), 3),
(set(["a", "b", "c", "d"]), 3),
(tuple(["a", "b", "c", "d"]), 3),
(["a"], 1),
([], 0),
],
)
def test_get_example_input_from_text_data(data, expected_length):
example_input = _get_example_input_from_text_data(data)
assert len(example_input["data"]) == expected_length


@pytest.mark.parametrize("data", ["random", [1, 2, 3], 420])
def test_get_example_input_from_text_data_invalid_text_data(data):
with pytest.raises(ValueError, match="The data needs to be a sequence of strings."):
_get_example_input_from_text_data(data)


def test_get_column_names():
with pytest.raises(
ValueError, match="The data is not a pandas.DataFrame or a numpy.ndarray."
ValueError,
match=(
"The data is not a pandas.DataFrame, a 2D numpy.ndarray or a "
"list/tuple that can be converted to a 2D numpy.ndarray."
),
):
_get_column_names("random")
with pytest.raises(ValueError):
_get_column_names(["a", "b", "c"])

X_array = np.ones((5, 10), dtype=np.float32)
Expand All @@ -556,11 +595,11 @@ def test_get_column_names():
assert _get_column_names(X_df) == expected_columns


def test_get_example_input_pandas_not_installed(pandas_not_installed):
def test_get_example_input_from_tabular_data_pandas_not_installed(pandas_not_installed):
# use pandas_not_installed fixture from conftest.py to pretend that pandas
# is not installed and check that the function does not raise when pandas
# import fails
_get_example_input(np.ones((5, 10)))
_get_example_input_from_tabular_data(np.ones((5, 10)))


def test_get_column_names_pandas_not_installed(pandas_not_installed):
Expand Down