diff --git a/docs/changes.rst b/docs/changes.rst index f1aa7739..4e21b8ea 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -9,6 +9,11 @@ skops Changelog :depth: 1 :local: +v0.5 +---- +- Support more array-like data types for tabular data and list-like data types + for text data. :pr:`179` by `Francesco Cariaggi`_. + v0.4 ---- - :func:`.io.dump` and :func:`.io.load` now work with file like objects, @@ -83,4 +88,4 @@ Contributors :user:`Adrin Jalali `, :user:`Merve Noyan `, :user:`Benjamin Bossan `, :user:`Ayyuce Demirbas -`, :user:`Prajjwal Mishra ` +`, :user:`Prajjwal Mishra `, :user:`Francesco Cariaggi ` diff --git a/skops/conftest.py b/skops/conftest.py index a09c69b8..4dcaed83 100644 --- a/skops/conftest.py +++ b/skops/conftest.py @@ -7,10 +7,12 @@ def pandas_not_installed(): # patch import so that it raises an ImportError when trying to import # pandas. This works because pandas is only imported lazily. + orig_import = __import__ + def mock_import(name, *args, **kwargs): if name == "pandas": raise ImportError - return __import__(name, *args, **kwargs) + return orig_import(name, *args, **kwargs) with patch("builtins.__import__", side_effect=mock_import): yield diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py index d76e1de3..f91a6664 100644 --- a/skops/hub_utils/_hf_hub.py +++ b/skops/hub_utils/_hf_hub.py @@ -5,14 +5,16 @@ from __future__ import annotations import collections +import itertools import json import os import shutil from pathlib import Path -from typing import Any, List, Literal, MutableMapping, Optional, Union +from typing import Any, List, Literal, MutableMapping, Optional, Sequence, Union import numpy as np from huggingface_hub import HfApi, InferenceApi, snapshot_download +from sklearn.utils import check_array SUPPORTED_TASKS = [ "tabular-classification", @@ -70,8 +72,8 @@ def _validate_folder(path: Union[str, Path]) -> None: raise TypeError(f"Model file {model_path} does not exist.") -def _get_example_input(data): - 
"""Returns the example input of a model. +def _get_example_input_from_tabular_data(data): + """Returns the example input of a model for a tabular task. The input is converted into a dictionary which is then stored in the config file. @@ -79,7 +81,8 @@ def _get_example_input(data): Parameters ---------- data: array-like - The input needs to be either a ``pandas.DataFrame`` or a + The input needs to be either a ``pandas.DataFrame``, a 2D + ``numpy.ndarray`` or a list/tuple that can be converted to a 2D ``numpy.ndarray``. The first 3 rows are used as example input. Returns @@ -96,28 +99,77 @@ def _get_example_input(data): # pandas is not installed, the data cannot be a pandas DataFrame pass - # here we convert the first three rows of the numpy array to a dict of lists + # here we convert the first three rows of `data` to a dict of lists # to be stored in the config file - if isinstance(data, np.ndarray): - return {f"x{x}": data[:3, x].tolist() for x in range(data.shape[1])} + if isinstance(data, (np.ndarray, list, tuple)): + data_slice = data[:3] + # This will raise a ValueError if the array is not 2D + data_slice_array = check_array(data_slice, ensure_2d=True) + return { + f"x{x}": data_slice_array[:, x].tolist() + for x in range(data_slice_array.shape[1]) + } + + raise ValueError( + "The data is not a pandas.DataFrame, a 2D numpy.ndarray or a " + "list/tuple that can be converted to a 2D numpy.ndarray." + ) + + +def _get_example_input_from_text_data(data: Sequence[str]): + """Returns the example input of a model for a text task. + + The input is converted into a dictionary which is then stored in the config + file. + + Parameters + ---------- + data: Sequence[str] + A sequence of strings. The first 3 elements are used as example input. + + Returns + ------- + example_input: dict of lists + The example input of the model as accepted by Hugging Face's backend. 
+ """ + + def _head(data, n): + is_data_subscriptable = hasattr(data, "__getitem__") + if is_data_subscriptable: + return data[:n] + + return list(itertools.islice(data, n)) - raise ValueError("The data is not a pandas.DataFrame or a numpy.ndarray.") + def _is_sequence_of_strings(data): + return not isinstance(data, str) and all(isinstance(x, str) for x in data) + + error_message = "The data needs to be a sequence of strings." + try: + data_head = _head(data, n=3) + if _is_sequence_of_strings(data_head): + return {"data": data_head} + else: + raise ValueError(error_message) + except TypeError as e: + raise ValueError(error_message) from e def _get_column_names(data): """Returns the column names of the input. - If data is a ``numpy.ndarray``, column names are assumed to be ``x0`` to - ``xn-1``, where ``n`` is the number of columns. + If data is not a ``pandas.DataFrame``, column names are assumed to be + ``x0`` to ``xn-1``, where ``n`` is the number of columns. Parameters ---------- - data: pandas.DataFrame or numpy.ndarray - The data whose columns names are to be returned. + data: array-like + The data whose columns names are to be returned. Must be a + ``pandas.DataFrame``, a 2D ``numpy.ndarray`` or a list/tuple that can + be converted to a 2D ``numpy.ndarray``. Returns ------- - columns: list of tuples + columns: list of strings A list of strings. Each string is a column name. """ try: @@ -131,10 +183,15 @@ def _get_column_names(data): # TODO: this is going to fail for Structured Arrays. We can add support for # them later if we see need for it. 
- if isinstance(data, np.ndarray): - return [f"x{x}" for x in range(data.shape[1])] - - raise ValueError("The data is not a pandas.DataFrame or a numpy.ndarray.") + if isinstance(data, (np.ndarray, list, tuple)): + # This will raise a ValueError if the array is not 2D + data_array = check_array(data, ensure_2d=True) + return [f"x{x}" for x in range(data_array.shape[1])] + + raise ValueError( + "The data is not a pandas.DataFrame, a 2D numpy.ndarray or a " + "list/tuple that can be converted to a 2D numpy.ndarray." + ) def _create_config( @@ -176,7 +233,7 @@ def _create_config( the model. It can be one of: ``tabular-classification``, ``tabular-regression``, ``text-classification``, ``text-regression``. - data: array-like + data: array-like or sequence The input to the model. This is used for two purposes: 1. Save an example input to the model, which is used by @@ -186,7 +243,10 @@ def _create_config( HuggingFace's backend to pass the data in the right form to the model. - The first 3 input values are used as example inputs. + The first 3 input values are used as example inputs. If the task is + ``tabular-classification`` or ``tabular-regression``, then data is + expected to be an array-like. Otherwise, it is expected to be a + sequence of strings. model_format: str The format used to persist the model. 
Can be ``"auto"``, ``"skops"`` @@ -223,13 +283,10 @@ def recursively_default_dict() -> MutableMapping: config["sklearn"]["model_format"] = model_format if "tabular" in task: - config["sklearn"]["example_input"] = _get_example_input(data) + config["sklearn"]["example_input"] = _get_example_input_from_tabular_data(data) config["sklearn"]["columns"] = _get_column_names(data) elif "text" in task: - if isinstance(data, list) and all(isinstance(x, str) for x in data): - config["sklearn"]["example_input"] = {"data": data[:3]} - else: - raise ValueError("The data needs to be a list of strings.") + config["sklearn"]["example_input"] = _get_example_input_from_text_data(data) dump_json(Path(dst) / "config.json", config) diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py index c4db9aee..cc0511f0 100644 --- a/skops/hub_utils/tests/test_hf_hub.py +++ b/skops/hub_utils/tests/test_hf_hub.py @@ -33,7 +33,8 @@ from skops.hub_utils._hf_hub import ( _create_config, _get_column_names, - _get_example_input, + _get_example_input_from_tabular_data, + _get_example_input_from_text_data, _validate_folder, ) from skops.hub_utils.tests.common import HF_HUB_TOKEN @@ -230,7 +231,7 @@ def test_create_config(data, task, expected_config): def test_create_config_invalid_text_data(temp_path): - with pytest.raises(ValueError, match="The data needs to be a list of strings."): + with pytest.raises(ValueError, match="The data needs to be a sequence of strings."): _create_config( model_path="model.pkl", requirements=['scikit-learn="1.1.1"', "numpy"], @@ -521,30 +522,68 @@ def test_update_env(repo_path, config_json): assert get_requirements(repo_path) == ['scikit-learn="1.1.2"'] -def test_get_example_input(): - """Test the _get_example_input function.""" +def test_get_example_input_from_tabular_data(): with pytest.raises( - ValueError, match="The data is not a pandas.DataFrame or a numpy.ndarray." 
+ ValueError, + match=( + "The data is not a pandas.DataFrame, a 2D numpy.ndarray or a " + "list/tuple that can be converted to a 2D numpy.ndarray." + ), ): - _get_example_input(["a", "b", "c"]) + _get_example_input_from_tabular_data("random") + with pytest.raises(ValueError): + _get_example_input_from_tabular_data(["a", "b", "c"]) + + examples = _get_example_input_from_tabular_data(np.ones((5, 10))) + # the result is a dictionary of column name: list of values + assert len(examples) == 10 + assert len(examples["x0"]) == 3 - examples = _get_example_input(np.ones((5, 10))) - # the result if a dictionary of column name: list of values + examples = _get_example_input_from_tabular_data(np.ones((5, 10)).tolist()) + # the result is a dictionary of column name: list of values assert len(examples) == 10 assert len(examples["x0"]) == 3 - examples = _get_example_input( + examples = _get_example_input_from_tabular_data( pd.DataFrame(np.ones((5, 10)), columns=[f"column{x}" for x in range(10)]) ) - # the result if a dictionary of column name: list of values + # the result is a dictionary of column name: list of values assert len(examples) == 10 assert len(examples["column0"]) == 3 +@pytest.mark.parametrize( + "data, expected_length", + [ + (["a", "b", "c", "d"], 3), + (np.array(["a", "b", "c", "d"]), 3), + (set(["a", "b", "c", "d"]), 3), + (tuple(["a", "b", "c", "d"]), 3), + (["a"], 1), + ([], 0), + ], +) +def test_get_example_input_from_text_data(data, expected_length): + example_input = _get_example_input_from_text_data(data) + assert len(example_input["data"]) == expected_length + + +@pytest.mark.parametrize("data", ["random", [1, 2, 3], 420]) +def test_get_example_input_from_text_data_invalid_text_data(data): + with pytest.raises(ValueError, match="The data needs to be a sequence of strings."): + _get_example_input_from_text_data(data) + + def test_get_column_names(): with pytest.raises( - ValueError, match="The data is not a pandas.DataFrame or a numpy.ndarray." 
+ ValueError, + match=( + "The data is not a pandas.DataFrame, a 2D numpy.ndarray or a " + "list/tuple that can be converted to a 2D numpy.ndarray." + ), ): + _get_column_names("random") + with pytest.raises(ValueError): _get_column_names(["a", "b", "c"]) X_array = np.ones((5, 10), dtype=np.float32) @@ -556,11 +595,11 @@ def test_get_column_names(): assert _get_column_names(X_df) == expected_columns -def test_get_example_input_pandas_not_installed(pandas_not_installed): +def test_get_example_input_from_tabular_data_pandas_not_installed(pandas_not_installed): # use pandas_not_installed fixture from conftest.py to pretend that pandas # is not installed and check that the function does not raise when pandas # import fails - _get_example_input(np.ones((5, 10))) + _get_example_input_from_tabular_data(np.ones((5, 10))) def test_get_column_names_pandas_not_installed(pandas_not_installed):