From 782b9e083db1cbaef4de2a3ffdab1541e01581e6 Mon Sep 17 00:00:00 2001 From: anferico Date: Sun, 9 Oct 2022 17:41:06 +0200 Subject: [PATCH 01/11] add support for more data types --- skops/hub_utils/_hf_hub.py | 81 ++++++++++++++++++---------- skops/hub_utils/tests/test_hf_hub.py | 4 +- 2 files changed, 56 insertions(+), 29 deletions(-) diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py index 5d7a629b..708c0bf1 100644 --- a/skops/hub_utils/_hf_hub.py +++ b/skops/hub_utils/_hf_hub.py @@ -5,6 +5,7 @@ from __future__ import annotations import collections +import itertools import json import os import shutil @@ -73,6 +74,14 @@ def _validate_folder(path: Union[str, Path]) -> None: raise TypeError(f"Model file {model_path} does not exist.") +def _convert_to_2d_numpy_array(data): + data_array = np.asarray(data) + if len(data_array.shape) != 2: + raise ValueError("The data must be convertible to a 2D numpy.ndarray.") + + return data_array + + def _get_example_input(data): """Returns the example input of a model. @@ -81,9 +90,10 @@ def _get_example_input(data): Parameters ---------- - data: array-like - The input needs to be either a ``pandas.DataFrame`` or a - ``numpy.ndarray``. The first 3 rows are used as example input. + data: pandas.DataFrame or array-like + The input needs to be anything that can be converted to a 2D + ``numpy.ndarray``, including a ``pandas.DataFrame``. The first 3 rows + are used as example input. Returns ------- @@ -95,28 +105,29 @@ def _get_example_input(data): if isinstance(data, pd.DataFrame): return {x: data[x][:3].to_list() for x in data.columns} - except ImportError: - # pandas is not installed, the data cannot be a pandas DataFrame - pass + except ImportError as e: + raise ValueError( + "The data cannot be a pandas.DataFrame because pandas is not installed." + ) from e # here we convert the first three rows of the numpy array to a dict of lists # to be stored in the config file - if isinstance(data, np.ndarray): - return {f"x{x}": data[:3, x].tolist() for x in range(data.shape[1])} - - raise ValueError("The data is not a pandas.DataFrame or a numpy.ndarray.") + data_array = _convert_to_2d_numpy_array(data) + return {f"x{x}": data_array[:3, x].tolist() for x in range(data_array.shape[1])} def _get_column_names(data): """Returns the column names of the input. - If data is a ``numpy.ndarray``, column names are assumed to be ``x0`` to - ``xn-1``, where ``n`` is the number of columns. + If data is not a ``pandas.DataFrame``, column names are assumed to be + ``x0`` to ``xn-1``, where ``n`` is the number of columns. Parameters ---------- - data: pandas.DataFrame or numpy.ndarray - The data whose columns names are to be returned. + data: pandas.DataFrame or array-like + The data whose columns names are to be returned. Must be a + ``pandas.DataFrame`` or anything that can be converted to a 2D + ``numpy.ndarray`` Returns ------- @@ -128,16 +139,13 @@ def _get_column_names(data): if isinstance(data, pd.DataFrame): return list(data.columns) - except ImportError: - # pandas is not installed, the data cannot be a pandas DataFrame - pass - - # TODO: this is going to fail for Structured Arrays. We can add support for - # them later if we see need for it. - if isinstance(data, np.ndarray): - return [f"x{x}" for x in range(data.shape[1])] + except ImportError as e: + raise ValueError( + "The data cannot be a pandas.DataFrame because pandas is not installed." + ) from e - raise ValueError("The data is not a pandas.DataFrame or a numpy.ndarray.") + data_array = _convert_to_2d_numpy_array(data) + return [f"x{x}" for x in range(data_array.shape[1])] def _create_config( @@ -174,7 +182,7 @@ def _create_config( the model. It can be one of: ``tabular-classification``, ``tabular-regression``, ``text-classification``, ``text-regression``. - data: array-like + data: array-like, or iterable The input to the model. This is used for two purposes: 1. Save an example input to the model, which is used by @@ -184,7 +192,10 @@ def _create_config( HuggingFace's backend to pass the data in the right form to the model. - The first 3 input values are used as example inputs. + The first 3 input values are used as example inputs. If the task is + ``tabular-classification`` or ``tabular-regression``, then data is + expected to be an array-like. Otherwise, it is expected to be an + iterable of strings. Returns ------- @@ -205,14 +216,28 @@ def recursively_default_dict() -> MutableMapping: config["sklearn"]["example_input"] = _get_example_input(data) config["sklearn"]["columns"] = _get_column_names(data) elif "text" in task: - if isinstance(data, list) and all(isinstance(x, str) for x in data): - config["sklearn"]["example_input"] = {"data": data[:3]} + if _is_iterable_of_strings(data): + config["sklearn"]["example_input"] = { + "data": list(itertools.islice(data, 3)) + } else: - raise ValueError("The data needs to be a list of strings.") + raise ValueError("The data needs to be an iterable of strings.") dump_json(Path(dst) / "config.json", config) +def _is_iterable_of_strings(data): + if isinstance(data, str): + return False + try: + # needed in case data is an iterator or a generator + data, data_copy = itertools.tee(data, 2) + return all(isinstance(x, str) for x in data_copy) + except TypeError: + # data is not iterable + return False + + def _check_model_file(path: str | Path) -> Path: """Perform sanity checks on the model file diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py index 362c2664..346cb7a7 100644 --- a/skops/hub_utils/tests/test_hf_hub.py +++ b/skops/hub_utils/tests/test_hf_hub.py @@ -197,7 +197,9 @@ def test_create_config(data, task, expected_config): def test_create_config_invalid_text_data(temp_path): - with pytest.raises(ValueError, match="The data needs to be a list of strings."): + with pytest.raises( + ValueError, match="The data needs to be an iterable of strings." + ): _create_config( model_path="model.pkl", requirements=['scikit-learn="1.1.1"', "numpy"], From fe798d89091cc256bdfcf1b3ab8f06251e1467e7 Mon Sep 17 00:00:00 2001 From: anferico Date: Tue, 11 Oct 2022 18:25:38 +0200 Subject: [PATCH 02/11] Add docstrings --- skops/hub_utils/_hf_hub.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py index 708c0bf1..c393a184 100644 --- a/skops/hub_utils/_hf_hub.py +++ b/skops/hub_utils/_hf_hub.py @@ -75,6 +75,26 @@ def _validate_folder(path: Union[str, Path]) -> None: def _convert_to_2d_numpy_array(data): + """Converts an array-like object to a 2D numpy.ndarray. + + Raises a ``ValueError`` if data cannot be converted to a 2D numpy.ndarray. + + Parameters + ---------- + data: pandas.DataFrame or array-like + Any object that can be converted to a 2D ``numpy.ndarray``, including + a ``pandas.DataFrame``. + + Raises + ------ + ValueError + Raised when the passed object cannot be converted to 2D numpy.ndarray. + + Returns + ------- + data_array: numpy.ndarray + The numpy.ndarray object obtained by converting data. + """ data_array = np.asarray(data) if len(data_array.shape) != 2: raise ValueError("The data must be convertible to a 2D numpy.ndarray.") @@ -227,6 +247,19 @@ def recursively_default_dict() -> MutableMapping: def _is_iterable_of_strings(data): + """Checks whether data is an iterable of strings. + + Parameters + ---------- + data: Any + Any object. + + Returns + ------- + is_iterable_of_strings: bool + A boolean variable indicating whether or not data is an iterable of + strings. + """ if isinstance(data, str): return False try: From a8f5e4d43ad464b6da8e301e913baffb21b7ce99 Mon Sep 17 00:00:00 2001 From: anferico Date: Tue, 11 Oct 2022 20:02:53 +0200 Subject: [PATCH 03/11] Update unit tests --- skops/hub_utils/tests/test_hf_hub.py | 64 +++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 2 deletions(-) diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py index 346cb7a7..ea8ab507 100644 --- a/skops/hub_utils/tests/test_hf_hub.py +++ b/skops/hub_utils/tests/test_hf_hub.py @@ -157,6 +157,42 @@ def test_validate_folder(config_json): } }, ), + ( + iris.data.values, + "tabular-classification", + { + "sklearn": { + "columns": ["x0", "x1", "x2", "x3"], + "environment": ['scikit-learn="1.1.1"', "numpy"], + "example_input": { + "x0": [1.4, 1.4, 1.3], + "x1": [0.2, 0.2, 0.2], + "x2": [5.1, 4.9, 4.7], + "x3": [3.5, 3.0, 3.2], + }, + "model": {"file": "model.pkl"}, + "task": "tabular-classification", + } + }, + ), + ( + iris.data.values.tolist(), + "tabular-classification", + { + "sklearn": { + "columns": ["x0", "x1", "x2", "x3"], + "environment": ['scikit-learn="1.1.1"', "numpy"], + "example_input": { + "x0": [1.4, 1.4, 1.3], + "x1": [0.2, 0.2, 0.2], + "x2": [5.1, 4.9, 4.7], + "x3": [3.5, 3.0, 3.2], + }, + "model": {"file": "model.pkl"}, + "task": "tabular-classification", + } + }, + ), ( ["test", "text", "problem", "random"], "text-classification", @@ -169,6 +205,30 @@ def test_validate_folder(config_json): } }, ), + ( + np.array(["test", "text", "problem", "random"]), + "text-classification", + { + "sklearn": { + "environment": ['scikit-learn="1.1.1"', "numpy"], + "example_input": {"data": ["test", "text", "problem"]}, + "model": {"file": "model.pkl"}, + "task": "text-classification", + } + }, + ), + ( + (f"test{n}" for n in range(4)), + "text-classification", + { + "sklearn": { + "environment": ['scikit-learn="1.1.1"', "numpy"], + "example_input": {"data": ["test0", "test1", "test2"]}, + "model": {"file": "model.pkl"}, + "task": "text-classification", + } + }, + ), ], ) def test_create_config(data, task, expected_config): @@ -475,7 +535,7 @@ def test_update_env(repo_path, config_json): def test_get_example_input(): """Test the _get_example_input function.""" with pytest.raises( - ValueError, match="The data is not a pandas.DataFrame or a numpy.ndarray." + ValueError, match="The data must be convertible to a 2D numpy.ndarray." ): _get_example_input(["a", "b", "c"]) @@ -494,7 +554,7 @@ def test_get_example_input(): def test_get_column_names(): with pytest.raises( - ValueError, match="The data is not a pandas.DataFrame or a numpy.ndarray." + ValueError, match="The data must be convertible to a 2D numpy.ndarray." ): _get_column_names(["a", "b", "c"]) From 5d7e75d1a3b0de5a72b86912223cde95700d14a4 Mon Sep 17 00:00:00 2001 From: anferico Date: Tue, 11 Oct 2022 20:03:33 +0200 Subject: [PATCH 04/11] Revert some erroneous changes --- skops/hub_utils/_hf_hub.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py index c393a184..17a99797 100644 --- a/skops/hub_utils/_hf_hub.py +++ b/skops/hub_utils/_hf_hub.py @@ -125,10 +125,9 @@ def _get_example_input(data): if isinstance(data, pd.DataFrame): return {x: data[x][:3].to_list() for x in data.columns} - except ImportError as e: - raise ValueError( - "The data cannot be a pandas.DataFrame because pandas is not installed." - ) from e + except ImportError: + # pandas is not installed, the data cannot be a pandas DataFrame + pass # here we convert the first three rows of the numpy array to a dict of lists # to be stored in the config file @@ -159,10 +158,9 @@ def _get_column_names(data): if isinstance(data, pd.DataFrame): return list(data.columns) - except ImportError as e: - raise ValueError( - "The data cannot be a pandas.DataFrame because pandas is not installed." - ) from e + except ImportError: + # pandas is not installed, the data cannot be a pandas DataFrame + pass data_array = _convert_to_2d_numpy_array(data) return [f"x{x}" for x in range(data_array.shape[1])] From ef89103d15e235f5e41a281fd4cc39d87c7d854e Mon Sep 17 00:00:00 2001 From: anferico Date: Tue, 11 Oct 2022 20:11:22 +0200 Subject: [PATCH 05/11] Update changelog --- docs/changes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/changes.rst b/docs/changes.rst index ca2b49f8..e0c9c0c9 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -11,6 +11,7 @@ skops Changelog v0.3 ---- +- Support more array-like data types for tabular data and list-like data types for text data. :pr:`179` by `Francesco Cariaggi`_. - Utility function to add arbitrary files to be uploaded to the hub by using :func:`.hub_utils.add_files`. :pr:`123` by `Benjamin Bossan`_. - Add ``private`` as an optional argument to :meth:`.hub_utils.push` to From def297b600bba9c19a8bbf6140945e0802cdf45c Mon Sep 17 00:00:00 2001 From: anferico Date: Thu, 13 Oct 2022 18:33:41 +0200 Subject: [PATCH 06/11] Add myself as contributor --- docs/changes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/changes.rst b/docs/changes.rst index e0c9c0c9..8f8fec47 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -56,4 +56,4 @@ Contributors ~~~~~~~~~~~~ :user:`Adrin Jalali `, :user:`Merve Noyan `, -:user:`Benjamin Bossan ` +:user:`Benjamin Bossan `, :user:`Francesco Cariaggi ` From 100169ef85c12fb0a465607fe98989ff2722ee60 Mon Sep 17 00:00:00 2001 From: anferico Date: Thu, 13 Oct 2022 21:10:00 +0200 Subject: [PATCH 07/11] Address PR review comments --- skops/hub_utils/_hf_hub.py | 68 ++++++++++++++++++++-------- skops/hub_utils/tests/test_hf_hub.py | 36 ++++++++++++++- 2 files changed, 84 insertions(+), 20 deletions(-) diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py index 17a99797..5d6bc237 100644 --- a/skops/hub_utils/_hf_hub.py +++ b/skops/hub_utils/_hf_hub.py @@ -11,7 +11,7 @@ import shutil import warnings from pathlib import Path -from typing import Any, List, MutableMapping, Optional, Union +from typing import Any, Iterable, List, MutableMapping, Optional, Union import numpy as np from huggingface_hub import HfApi, InferenceApi, snapshot_download @@ -96,7 +96,7 @@ def _convert_to_2d_numpy_array(data): The numpy.ndarray object obtained by converting data. """ data_array = np.asarray(data) - if len(data_array.shape) != 2: + if data_array.ndim != 2: raise ValueError("The data must be convertible to a 2D numpy.ndarray.") return data_array @@ -131,8 +131,12 @@ def _get_example_input(data): # here we convert the first three rows of the numpy array to a dict of lists # to be stored in the config file - data_array = _convert_to_2d_numpy_array(data) - return {f"x{x}": data_array[:3, x].tolist() for x in range(data_array.shape[1])} + data_slice = data[:3] + data_slice_array = _convert_to_2d_numpy_array(data_slice) + return { + f"x{x}": data_slice_array[:3, x].tolist() + for x in range(data_slice_array.shape[1]) + } def _get_column_names(data): @@ -234,18 +238,48 @@ def recursively_default_dict() -> MutableMapping: config["sklearn"]["example_input"] = _get_example_input(data) config["sklearn"]["columns"] = _get_column_names(data) elif "text" in task: - if _is_iterable_of_strings(data): - config["sklearn"]["example_input"] = { - "data": list(itertools.islice(data, 3)) - } - else: - raise ValueError("The data needs to be an iterable of strings.") + error_message = "The data needs to be an iterable of strings." + try: + data_head = _head(data, n=3) + if _is_sequence_of_strings(data_head): + config["sklearn"]["example_input"] = {"data": data_head} + else: + raise ValueError(error_message) + except TypeError as e: + raise ValueError(error_message) from e dump_json(Path(dst) / "config.json", config) -def _is_iterable_of_strings(data): - """Checks whether data is an iterable of strings. +def _head(data: Iterable, n: int): + """Returns the first n elements of data. + + Raises a ``TypeError`` if data is not an iterable. + + Parameters + ---------- + data: Iterable + Any iterable. + + n: int + Number of elements to extract from the head of data. + + Raises + ------ + TypeError + If data is not an iterable (raised by itertools.islice). + + Returns + ------- + data_head: list + A list containing the first n elements of data. + """ + data, data_copy = itertools.tee(data, 2) + return list(itertools.islice(data_copy, n)) + + +def _is_sequence_of_strings(data): + """Checks whether data is a sequence of strings. Parameters ---------- @@ -254,18 +288,16 @@ def _is_iterable_of_strings(data): Returns ------- - is_iterable_of_strings: bool - A boolean variable indicating whether or not data is an iterable of + is_sequence_of_strings: bool + A boolean variable indicating whether or not data is a sequence of strings. """ if isinstance(data, str): return False try: - # needed in case data is an iterator or a generator - data, data_copy = itertools.tee(data, 2) - return all(isinstance(x, str) for x in data_copy) + return all(isinstance(x, str) for x in data) except TypeError: - # data is not iterable + # data isn't even iterable, can't be a sequence of strings return False diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py index ea8ab507..7905d94f 100644 --- a/skops/hub_utils/tests/test_hf_hub.py +++ b/skops/hub_utils/tests/test_hf_hub.py @@ -32,6 +32,8 @@ _create_config, _get_column_names, _get_example_input, + _head, + _is_sequence_of_strings, _validate_folder, ) from skops.hub_utils.tests.common import HF_HUB_TOKEN @@ -256,7 +258,8 @@ def test_create_config(data, task, expected_config): ) -def test_create_config_invalid_text_data(temp_path): +@pytest.mark.parametrize("data", [[1, 2, 3], 420]) +def test_create_config_invalid_text_data(data, temp_path): with pytest.raises( ValueError, match="The data needs to be an iterable of strings." ): @@ -264,11 +267,40 @@ def test_create_config_invalid_text_data(temp_path): model_path="model.pkl", requirements=['scikit-learn="1.1.1"', "numpy"], task="text-classification", - data=[1, 2, 3], + data=data, dst=temp_path, ) +@pytest.mark.parametrize( + "data, n, expected_output", + [ + ([0, "1", 2, 3, 4], 3, [0, "1", 2]), + ((i for i in range(5)), 3, [0, 1, 2]), + ], +) +def test_head(data, n, expected_output): + assert _head(data, n) == expected_output + + +def test_head_invalid_iterable(): + with pytest.raises(TypeError): + _head(420) + + +@pytest.mark.parametrize( + "data, is_sequence_of_strings", + [ + ("sample text", False), + (["sample", 420], False), + (420, False), + (["sample", "text"], True), + ], +) +def test_is_sequence_of_strings(data, is_sequence_of_strings): + assert _is_sequence_of_strings(data) == is_sequence_of_strings + + def test_atomic_init(classifier_pickle, temp_path): with pytest.raises(ValueError): # this fails since we're passing an invalid task. From eab3c299d586c0a92c0b21e1756c2b722c92a065 Mon Sep 17 00:00:00 2001 From: anferico Date: Thu, 8 Dec 2022 11:08:10 +0100 Subject: [PATCH 08/11] Update changes.rst --- docs/changes.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/changes.rst b/docs/changes.rst index 01d7c1d3..70453a3f 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -17,7 +17,8 @@ v0.4 v0.3 ---- -- Support more array-like data types for tabular data and list-like data types for text data. :pr:`179` by `Francesco Cariaggi`_. +- Support more array-like data types for tabular data and list-like data types + for text data. :pr:`179` by `Francesco Cariaggi`_. - Utility function to add arbitrary files to be uploaded to the hub by using :func:`.hub_utils.add_files`. :pr:`123` by `Benjamin Bossan`_. - Add ``private`` as an optional argument to :meth:`skops.hub_utils.push` to From e6f5d2e1d555f3e598c1c818222012641cacbe32 Mon Sep 17 00:00:00 2001 From: anferico Date: Thu, 8 Dec 2022 11:09:32 +0100 Subject: [PATCH 09/11] Address PR comments --- skops/hub_utils/_hf_hub.py | 180 +++++++++++---------------- skops/hub_utils/tests/test_hf_hub.py | 175 +++++++++----------------- 2 files changed, 134 insertions(+), 221 deletions(-) diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py index e011c12e..5df8734d 100644 --- a/skops/hub_utils/_hf_hub.py +++ b/skops/hub_utils/_hf_hub.py @@ -14,6 +14,7 @@ import numpy as np from huggingface_hub import HfApi, InferenceApi, snapshot_download +from sklearn.utils import check_array from ..utils.fixes import Literal @@ -73,69 +74,89 @@ def _validate_folder(path: Union[str, Path]) -> None: raise TypeError(f"Model file {model_path} does not exist.") -def _convert_to_2d_numpy_array(data): - """Converts an array-like object to a 2D numpy.ndarray. +def _get_example_input_from_tabular_data(data): + """Returns the example input of a model for a tabular task. - Raises a ``ValueError`` if data cannot be converted to a 2D numpy.ndarray. + The input is converted into a dictionary which is then stored in the config + file. Parameters ---------- - data: pandas.DataFrame or array-like - Any object that can be converted to a 2D ``numpy.ndarray``, including - a ``pandas.DataFrame``. - - Raises - ------ - ValueError - Raised when the passed object cannot be converted to 2D numpy.ndarray. + data: array-like + The input needs to be either a ``pandas.DataFrame``, a 2D + ``numpy.ndarray`` or a list/tuple that can be converted to a 2D + ``numpy.ndarray``. The first 3 rows are used as example input. Returns ------- - data_array: numpy.ndarray - The numpy.ndarray object obtained by converting data. + example_input: dict of lists + The example input of the model as accepted by Hugging Face's backend. """ - data_array = np.asarray(data) - if data_array.ndim != 2: - raise ValueError("The data must be convertible to a 2D numpy.ndarray.") + try: + import pandas as pd + + if isinstance(data, pd.DataFrame): + return {x: data[x][:3].to_list() for x in data.columns} + except ImportError: + # pandas is not installed, the data cannot be a pandas DataFrame + pass - return data_array + # here we convert the first three rows of `data` to a dict of lists + # to be stored in the config file + if isinstance(data, (np.ndarray, list, tuple)): + data_slice = data[:3] + # This will raise a ValueError if the array is not 2D + data_slice_array = check_array(data_slice, ensure_2d=True) + return { + f"x{x}": data_slice_array[:, x].tolist() + for x in range(data_slice_array.shape[1]) + } + + raise ValueError( + "The data is not a pandas.DataFrame, a 2D numpy.ndarray or a " + "list/tuple that can be converted to a 2D numpy.ndarray." + ) -def _get_example_input(data): - """Returns the example input of a model. +def _get_example_input_from_text_data(data: Iterable[str]): + """Returns the example input of a model for a text task. The input is converted into a dictionary which is then stored in the config file. Parameters ---------- - data: pandas.DataFrame or array-like - The input needs to be anything that can be converted to a 2D - ``numpy.ndarray``, including a ``pandas.DataFrame``. The first 3 rows - are used as example input. + data: Iterable[str] + An iterable of strings. The first 3 elements are used as example input. Returns ------- example_input: dict of lists The example input of the model as accepted by Hugging Face's backend. """ - try: - import pandas as pd - if isinstance(data, pd.DataFrame): - return {x: data[x][:3].to_list() for x in data.columns} - except ImportError: - # pandas is not installed, the data cannot be a pandas DataFrame - pass + def _head(data, n): + data, data_copy = itertools.tee(data, 2) + return list(itertools.islice(data_copy, n)) - # here we convert the first three rows of the numpy array to a dict of lists - # to be stored in the config file - data_slice = data[:3] - data_slice_array = _convert_to_2d_numpy_array(data_slice) - return { - f"x{x}": data_slice_array[:3, x].tolist() - for x in range(data_slice_array.shape[1]) - } + def _is_sequence_of_strings(data): + if isinstance(data, str): + return False + try: + return all(isinstance(x, str) for x in data) + except TypeError: + # data isn't even iterable, can't be a sequence of strings + return False + + error_message = "The data needs to be an iterable of strings." + try: + data_head = _head(data, n=3) + if _is_sequence_of_strings(data_head): + return {"data": data_head} + else: + raise ValueError(error_message) + except TypeError as e: + raise ValueError(error_message) from e def _get_column_names(data): @@ -146,14 +167,14 @@ def _get_column_names(data): Parameters ---------- - data: pandas.DataFrame or array-like + data: array-like The data whose columns names are to be returned. Must be a - ``pandas.DataFrame`` or anything that can be converted to a 2D - ``numpy.ndarray`` + ``pandas.DataFrame``, a 2D ``numpy.ndarray`` or a list/tuple that can + be converted to a 2D ``numpy.ndarray``. Returns ------- - columns: list of tuples + columns: list of strings A list of strings. Each string is a column name. """ try: @@ -165,8 +186,17 @@ def _get_column_names(data): # pandas is not installed, the data cannot be a pandas DataFrame pass - data_array = _convert_to_2d_numpy_array(data) - return [f"x{x}" for x in range(data_array.shape[1])] + # TODO: this is going to fail for Structured Arrays. We can add support for + # them later if we see need for it. + if isinstance(data, (np.ndarray, list, tuple)): + # This will raise a ValueError if the array is not 2D + data_array = check_array(data, ensure_2d=True) + return [f"x{x}" for x in range(data_array.shape[1])] + + raise ValueError( + "The data is not a pandas.DataFrame, a 2D numpy.ndarray or a " + "list/tuple that can be converted to a 2D numpy.ndarray." + ) def _create_config( @@ -234,72 +264,14 @@ def recursively_default_dict() -> MutableMapping: config["sklearn"]["task"] = task if "tabular" in task: - config["sklearn"]["example_input"] = _get_example_input(data) + config["sklearn"]["example_input"] = _get_example_input_from_tabular_data(data) config["sklearn"]["columns"] = _get_column_names(data) elif "text" in task: - error_message = "The data needs to be an iterable of strings." - try: - data_head = _head(data, n=3) - if _is_sequence_of_strings(data_head): - config["sklearn"]["example_input"] = {"data": data_head} - else: - raise ValueError(error_message) - except TypeError as e: - raise ValueError(error_message) from e + config["sklearn"]["example_input"] = _get_example_input_from_text_data(data) dump_json(Path(dst) / "config.json", config) -def _head(data: Iterable, n: int): - """Returns the first n elements of data. - - Raises a ``TypeError`` if data is not an iterable. - - Parameters - ---------- - data: Iterable - Any iterable. - - n: int - Number of elements to extract from the head of data. - - Raises - ------ - TypeError - If data is not an iterable (raised by itertools.islice). - - Returns - ------- - data_head: list - A list containing the first n elements of data. - """ - data, data_copy = itertools.tee(data, 2) - return list(itertools.islice(data_copy, n)) - - -def _is_sequence_of_strings(data): - """Checks whether data is a sequence of strings. - - Parameters - ---------- - data: Any - Any object. - - Returns - ------- - is_sequence_of_strings: bool - A boolean variable indicating whether or not data is a sequence of - strings. - """ - if isinstance(data, str): - return False - try: - return all(isinstance(x, str) for x in data) - except TypeError: - # data isn't even iterable, can't be a sequence of strings - return False - - def _check_model_file(path: str | Path) -> Path: """Perform sanity checks on the model file diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py index 068e8c27..2252e684 100644 --- a/skops/hub_utils/tests/test_hf_hub.py +++ b/skops/hub_utils/tests/test_hf_hub.py @@ -32,9 +32,8 @@ from skops.hub_utils._hf_hub import ( _create_config, _get_column_names, - _get_example_input, - _head, - _is_sequence_of_strings, + _get_example_input_from_tabular_data, + _get_example_input_from_text_data, _validate_folder, ) from skops.hub_utils.tests.common import HF_HUB_TOKEN @@ -162,42 +161,6 @@ def test_validate_folder(config_json): } }, ), - ( - iris.data.values, - "tabular-classification", - { - "sklearn": { - "columns": ["x0", "x1", "x2", "x3"], - "environment": ['scikit-learn="1.1.1"', "numpy"], - "example_input": { - "x0": [1.4, 1.4, 1.3], - "x1": [0.2, 0.2, 0.2], - "x2": [5.1, 4.9, 4.7], - "x3": [3.5, 3.0, 3.2], - }, - "model": {"file": "model.pkl"}, - "task": "tabular-classification", - } - }, - ), - ( - iris.data.values.tolist(), - "tabular-classification", - { - "sklearn": { - "columns": ["x0", "x1", "x2", "x3"], - "environment": ['scikit-learn="1.1.1"', "numpy"], - "example_input": { - "x0": [1.4, 1.4, 1.3], - "x1": [0.2, 0.2, 0.2], - "x2": [5.1, 4.9, 4.7], - "x3": [3.5, 3.0, 3.2], - }, - "model": {"file": "model.pkl"}, - "task": "tabular-classification", - } - }, - ), ( ["test", "text", "problem", "random"], "text-classification", @@ -210,30 +173,6 @@ def test_validate_folder(config_json): } }, ), - ( - np.array(["test", "text", "problem", "random"]), - "text-classification", - { - "sklearn": { - "environment": ['scikit-learn="1.1.1"', "numpy"], - "example_input": {"data": ["test", "text", "problem"]}, - "model": {"file": "model.pkl"}, - "task": "text-classification", - } - }, - ), - ( - (f"test{n}" for n in range(4)), - "text-classification", - { - "sklearn": { - "environment": ['scikit-learn="1.1.1"', "numpy"], - "example_input": {"data": ["test0", "test1", "test2"]}, - "model": {"file": "model.pkl"}, - "task": "text-classification", - } - }, - ), ], ) def test_create_config(data, task, expected_config): @@ -261,49 +200,6 @@ def test_create_config(data, task, expected_config): ) -@pytest.mark.parametrize("data", [[1, 2, 3], 420]) -def test_create_config_invalid_text_data(data, temp_path): - with pytest.raises( - ValueError, match="The data needs to be an iterable of strings." - ): - _create_config( - model_path="model.pkl", - requirements=['scikit-learn="1.1.1"', "numpy"], - task="text-classification", - data=data, - dst=temp_path, - ) - - -@pytest.mark.parametrize( - "data, n, expected_output", - [ - ([0, "1", 2, 3, 4], 3, [0, "1", 2]), - ((i for i in range(5)), 3, [0, 1, 2]), - ], -) -def test_head(data, n, expected_output): - assert _head(data, n) == expected_output - - -def test_head_invalid_iterable(): - with pytest.raises(TypeError): - _head(420) - - -@pytest.mark.parametrize( - "data, is_sequence_of_strings", - [ - ("sample text", False), - (["sample", 420], False), - (420, False), - (["sample", "text"], True), - ], -) -def test_is_sequence_of_strings(data, is_sequence_of_strings): - assert _is_sequence_of_strings(data) == is_sequence_of_strings - - def test_atomic_init(classifier_pickle, temp_path): with pytest.raises(ValueError): # this fails since we're passing an invalid task. @@ -571,30 +467,75 @@ def test_update_env(repo_path, config_json): assert get_requirements(repo_path) == ['scikit-learn="1.1.2"'] -def test_get_example_input(): - """Test the _get_example_input function.""" +def test_get_example_input_from_tabular_data(): with pytest.raises( - ValueError, match="The data must be convertible to a 2D numpy.ndarray." + ValueError, + match=( + "The data is not a pandas.DataFrame, a 2D numpy.ndarray or a " + "list/tuple that can be converted to a 2D numpy.ndarray." + ), ): - _get_example_input(["a", "b", "c"]) + _get_example_input_from_tabular_data("random") + with pytest.raises(ValueError): + _get_example_input_from_tabular_data(["a", "b", "c"]) - examples = _get_example_input(np.ones((5, 10))) - # the result if a dictionary of column name: list of values + examples = _get_example_input_from_tabular_data(np.ones((5, 10))) + # the result is a dictionary of column name: list of values assert len(examples) == 10 assert len(examples["x0"]) == 3 - examples = _get_example_input( + examples = _get_example_input_from_tabular_data(np.ones((5, 10)).tolist()) + # the result is a dictionary of column name: list of values + assert len(examples) == 10 + assert len(examples["x0"]) == 3 + + examples = _get_example_input_from_tabular_data( pd.DataFrame(np.ones((5, 10)), columns=[f"column{x}" for x in range(10)]) ) - # the result if a dictionary of column name: list of values + # the result is a dictionary of column name: list of values assert len(examples) == 10 assert len(examples["column0"]) == 3 +def test_get_example_input_from_text_data(): + examples = _get_example_input_from_text_data(["a", "b", "c", "d"]) + assert len(examples) == 3 + + examples = _get_example_input_from_text_data(np.array(["a", "b", "c", "d"])) + assert len(examples) == 3 + + examples = _get_example_input_from_text_data((c for c in ["a", "b", "c", "d"])) + assert len(examples) == 3 + + examples = _get_example_input_from_text_data([]) + assert len(examples) == 0 + + +@pytest.mark.parametrize("data", ["random", [1, 2, 3], 420]) +def test_get_example_input_from_text_data_invalid_text_data(data): + with pytest.raises( + ValueError, match="The data needs to be an iterable of strings." + ): + _get_example_input_from_text_data(data) + + +def test_get_example_input_from_text_data_generator_not_exhausted(): + generator = (f"s{x}" for x in range(3)) + _get_example_input_from_text_data(generator) + # check that next() doesn't raise a StopIteration + next(generator) + + def test_get_column_names(): with pytest.raises( - ValueError, match="The data must be convertible to a 2D numpy.ndarray." + ValueError, + match=( + "The data is not a pandas.DataFrame, a 2D numpy.ndarray or a " + "list/tuple that can be converted to a 2D numpy.ndarray." + ), ): + _get_column_names("random") + with pytest.raises(ValueError): _get_column_names(["a", "b", "c"]) X_array = np.ones((5, 10), dtype=np.float32) @@ -606,11 +547,11 @@ def test_get_column_names(): assert _get_column_names(X_df) == expected_columns -def test_get_example_input_pandas_not_installed(pandas_not_installed): +def test_get_example_input_from_tabular_data_pandas_not_installed(pandas_not_installed): # use pandas_not_installed fixture from conftest.py to pretend that pandas # is not installed and check that the function does not raise when pandas # import fails - _get_example_input(np.ones((5, 10))) + _get_example_input_from_tabular_data(np.ones((5, 10))) def test_get_column_names_pandas_not_installed(pandas_not_installed): From ecbdf385bd241ab076f6080b6261e95bd9176a3d Mon Sep 17 00:00:00 2001 From: anferico Date: Sun, 8 Jan 2023 19:17:58 +0100 Subject: [PATCH 10/11] Remove support for generators in text tasks + small fixes --- docs/changes.rst | 4 ++-- skops/conftest.py | 4 +++- skops/hub_utils/_hf_hub.py | 23 ++++++++++------------ skops/hub_utils/tests/test_hf_hub.py | 29 ++++++++++------------------ 4 files changed, 25 insertions(+), 35 deletions(-) diff --git a/docs/changes.rst b/docs/changes.rst index 60b26541..a66a68c0 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -26,11 +26,11 @@ v0.4 section/New section": "content"})`` to add "content" a new subsection called "New section" to an existing section called "Existing section". :pr:`203` by `Benjamin Bossan`_. +- Support more array-like data types for tabular data and list-like data types + for text data. :pr:`179` by `Francesco Cariaggi`_. v0.3 ---- -- Support more array-like data types for tabular data and list-like data types - for text data. :pr:`179` by `Francesco Cariaggi`_. - Utility function to add arbitrary files to be uploaded to the hub by using :func:`.hub_utils.add_files`. :pr:`123` by `Benjamin Bossan`_. - Add ``private`` as an optional argument to :meth:`skops.hub_utils.push` to diff --git a/skops/conftest.py b/skops/conftest.py index a09c69b8..4dcaed83 100644 --- a/skops/conftest.py +++ b/skops/conftest.py @@ -7,10 +7,12 @@ def pandas_not_installed(): # patch import so that it raises an ImportError when trying to import # pandas. This works because pandas is only imported lazily. + orig_import = __import__ + def mock_import(name, *args, **kwargs): if name == "pandas": raise ImportError - return __import__(name, *args, **kwargs) + return orig_import(name, *args, **kwargs) with patch("builtins.__import__", side_effect=mock_import): yield diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py index 3b3135de..76226912 100644 --- a/skops/hub_utils/_hf_hub.py +++ b/skops/hub_utils/_hf_hub.py @@ -10,14 +10,12 @@ import os import shutil from pathlib import Path -from typing import Any, List, MutableMapping, Optional, Sequence, Union +from typing import Any, List, Literal, MutableMapping, Optional, Sequence, Union import numpy as np from huggingface_hub import HfApi, InferenceApi, snapshot_download from sklearn.utils import check_array -from ..utils.fixes import Literal - SUPPORTED_TASKS = [ "tabular-classification", "tabular-regression", @@ -127,7 +125,7 @@ def _get_example_input_from_text_data(data: Sequence[str]): Parameters ---------- data: Sequence[str] - An sequence of strings. The first 3 elements are used as example input. + A sequence of strings. The first 3 elements are used as example input. Returns ------- @@ -136,17 +134,16 @@ def _get_example_input_from_text_data(data: Sequence[str]): """ def _head(data, n): - data, data_copy = itertools.tee(data, 2) - return list(itertools.islice(data_copy, n)) + def is_subscriptable(data): + return hasattr(data, "__getitem__") + + if is_subscriptable(data): + return data[:n] + + return list(itertools.islice(data, n)) def _is_sequence_of_strings(data): - if isinstance(data, str): - return False - try: - return all(isinstance(x, str) for x in data) - except TypeError: - # data isn't even iterable, can't be a sequence of strings - return False + return not isinstance(data, str) and all(isinstance(x, str) for x in data) error_message = "The data needs to be a sequence of strings." try: diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py index 7687e5e6..1abbad2b 100644 --- a/skops/hub_utils/tests/test_hf_hub.py +++ b/skops/hub_utils/tests/test_hf_hub.py @@ -231,7 +231,7 @@ def test_create_config(data, task, expected_config): def test_create_config_invalid_text_data(temp_path): - with pytest.raises(ValueError, match="The data needs to be a list of strings."): + with pytest.raises(ValueError, match="The data needs to be a sequence of strings."): _create_config( model_path="model.pkl", requirements=['scikit-learn="1.1.1"', "numpy"], @@ -553,34 +553,25 @@ def test_get_example_input_from_tabular_data(): def test_get_example_input_from_text_data(): - examples = _get_example_input_from_text_data(["a", "b", "c", "d"]) - assert len(examples) == 3 + example_input = _get_example_input_from_text_data(["a", "b", "c", "d"]) + assert len(example_input["data"]) == 3 - examples = _get_example_input_from_text_data(np.array(["a", "b", "c", "d"])) - assert len(examples) == 3 + example_input = _get_example_input_from_text_data(np.array(["a", "b", "c", "d"])) + assert len(example_input["data"]) == 3 - examples = _get_example_input_from_text_data((c for c in ["a", "b", "c", "d"])) - assert len(examples) == 3 + example_input = _get_example_input_from_text_data(set(["a", "b", "c", "d"])) + assert len(example_input["data"]) == 3 - examples = _get_example_input_from_text_data([]) - assert len(examples) == 0 + example_input = _get_example_input_from_text_data([]) + assert len(example_input["data"]) == 0 @pytest.mark.parametrize("data", ["random", [1, 2, 3], 420]) def test_get_example_input_from_text_data_invalid_text_data(data): - with pytest.raises( - ValueError, match="The data needs to be an iterable of strings." - ): + with pytest.raises(ValueError, match="The data needs to be a sequence of strings."): _get_example_input_from_text_data(data) -def test_get_example_input_from_text_data_generator_not_exhausted(): - generator = (f"s{x}" for x in range(3)) - _get_example_input_from_text_data(generator) - # check that next() doesn't raise a StopIteration - next(generator) - - def test_get_column_names(): with pytest.raises( ValueError, From 6b41b6935d7cd70f1e3198e7232724939c6ee6a7 Mon Sep 17 00:00:00 2001 From: anferico Date: Thu, 19 Jan 2023 18:37:25 +0100 Subject: [PATCH 11/11] Add new section to changelog (v0.5) --- docs/changes.rst | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/changes.rst b/docs/changes.rst index a66a68c0..4e21b8ea 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -9,6 +9,11 @@ skops Changelog :depth: 1 :local: +v0.5 +---- +- Support more array-like data types for tabular data and list-like data types + for text data. :pr:`179` by `Francesco Cariaggi`_. + v0.4 ---- - :func:`.io.dump` and :func:`.io.load` now work with file like objects, @@ -26,8 +31,6 @@ v0.4 section/New section": "content"})`` to add "content" a new subsection called "New section" to an existing section called "Existing section". :pr:`203` by `Benjamin Bossan`_. -- Support more array-like data types for tabular data and list-like data types - for text data. :pr:`179` by `Francesco Cariaggi`_. v0.3 ----