skops-dev · adrinjalali · Jan 20, 2023 · Oct 9, 2022 · Oct 11, 2022 · Oct 11, 2022
diff --git a/docs/changes.rst b/docs/changes.rst
@@ -9,6 +9,11 @@ skops Changelog
     :depth: 1
     :local:
 
+v0.5
+----
+- Support more array-like data types for tabular data and list-like data types
+  for text data. :pr:`179` by `Francesco Cariaggi`_.
+
 v0.4
 ----
 - :func:`.io.dump` and :func:`.io.load` now work with file like objects,
@@ -83,4 +88,4 @@ Contributors
 
 :user:`Adrin Jalali <adrinjalali>`, :user:`Merve Noyan <merveenoyan>`,
 :user:`Benjamin Bossan <BenjaminBossan>`, :user:`Ayyuce Demirbas
-<ayyucedemirbas>`, :user:`Prajjwal Mishra <p-mishra1>`
+<ayyucedemirbas>`, :user:`Prajjwal Mishra <p-mishra1>`, :user:`Francesco Cariaggi <anferico>`
diff --git a/skops/conftest.py b/skops/conftest.py
@@ -7,10 +7,12 @@
 def pandas_not_installed():
     # patch import so that it raises an ImportError when trying to import
     # pandas. This works because pandas is only imported lazily.
+    orig_import = __import__
+
     def mock_import(name, *args, **kwargs):
         if name == "pandas":
             raise ImportError
-        return __import__(name, *args, **kwargs)
+        return orig_import(name, *args, **kwargs)
 
     with patch("builtins.__import__", side_effect=mock_import):
         yield
diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py
@@ -5,14 +5,16 @@
 from __future__ import annotations
 
 import collections
+import itertools
 import json
 import os
 import shutil
 from pathlib import Path
-from typing import Any, List, Literal, MutableMapping, Optional, Union
+from typing import Any, List, Literal, MutableMapping, Optional, Sequence, Union
 
 import numpy as np
 from huggingface_hub import HfApi, InferenceApi, snapshot_download
+from sklearn.utils import check_array
 
 SUPPORTED_TASKS = [
     "tabular-classification",
@@ -70,16 +72,17 @@ def _validate_folder(path: Union[str, Path]) -> None:
         raise TypeError(f"Model file {model_path} does not exist.")
 
 
-def _get_example_input(data):
-    """Returns the example input of a model.
+def _get_example_input_from_tabular_data(data):
+    """Returns the example input of a model for a tabular task.
 
     The input is converted into a dictionary which is then stored in the config
     file.
 
     Parameters
     ----------
     data: array-like
-        The input needs to be either a ``pandas.DataFrame`` or a
+        The input needs to be either a ``pandas.DataFrame``, a 2D
+        ``numpy.ndarray`` or a list/tuple that can be converted to a 2D
         ``numpy.ndarray``. The first 3 rows are used as example input.
 
     Returns
@@ -96,28 +99,77 @@ def _get_example_input(data):
         # pandas is not installed, the data cannot be a pandas DataFrame
         pass
 
-    # here we convert the first three rows of the numpy array to a dict of lists
+    # here we convert the first three rows of `data` to a dict of lists
     # to be stored in the config file
-    if isinstance(data, np.ndarray):
-        return {f"x{x}": data[:3, x].tolist() for x in range(data.shape[1])}
+    if isinstance(data, (np.ndarray, list, tuple)):
+        data_slice = data[:3]
+        # This will raise a ValueError if the array is not 2D
+        data_slice_array = check_array(data_slice, ensure_2d=True)
+        return {
+            f"x{x}": data_slice_array[:, x].tolist()
+            for x in range(data_slice_array.shape[1])
+        }
+
+    raise ValueError(
+        "The data is not a pandas.DataFrame, a 2D numpy.ndarray or a "
+        "list/tuple that can be converted to a 2D numpy.ndarray."
+    )
+
+
+def _get_example_input_from_text_data(data: Sequence[str]):
+    """Returns the example input of a model for a text task.
+
+    The input is converted into a dictionary which is then stored in the config
+    file.
+
+    Parameters
+    ----------
+    data: Sequence[str]
+        A sequence of strings. The first 3 elements are used as example input.
+
+    Returns
+    -------
+    example_input: dict of lists
+        The example input of the model as accepted by Hugging Face's backend.
+    """
+
+    def _head(data, n):
+        is_data_subscriptable = hasattr(data, "__getitem__")
+        if is_data_subscriptable:
+            return data[:n]
+
+        return list(itertools.islice(data, n))
 
-    raise ValueError("The data is not a pandas.DataFrame or a numpy.ndarray.")
+    def _is_sequence_of_strings(data):
+        return not isinstance(data, str) and all(isinstance(x, str) for x in data)
+
+    error_message = "The data needs to be a sequence of strings."
+    try:
+        data_head = _head(data, n=3)
+        if _is_sequence_of_strings(data_head):
+            return {"data": data_head}
+        else:
+            raise ValueError(error_message)
+    except TypeError as e:
+        raise ValueError(error_message) from e
 
 
 def _get_column_names(data):
     """Returns the column names of the input.
 
-    If data is a ``numpy.ndarray``, column names are assumed to be ``x0`` to
-    ``xn-1``, where ``n`` is the number of columns.
+    If data is not a ``pandas.DataFrame``, column names are assumed to be
+    ``x0`` to ``xn-1``, where ``n`` is the number of columns.
 
     Parameters
     ----------
-    data: pandas.DataFrame or numpy.ndarray
-        The data whose columns names are to be returned.
+    data: array-like
+        The data whose columns names are to be returned. Must be a
+        ``pandas.DataFrame``, a 2D ``numpy.ndarray`` or a list/tuple that can
+        be converted to a 2D ``numpy.ndarray``.
 
     Returns
     -------
-    columns: list of tuples
+    columns: list of strings
         A list of strings. Each string is a column name.
     """
     try:
@@ -131,10 +183,15 @@ def _get_column_names(data):
 
     # TODO: this is going to fail for Structured Arrays. We can add support for
     # them later if we see need for it.
-    if isinstance(data, np.ndarray):
-        return [f"x{x}" for x in range(data.shape[1])]
-
-    raise ValueError("The data is not a pandas.DataFrame or a numpy.ndarray.")
+    if isinstance(data, (np.ndarray, list, tuple)):
+        # This will raise a ValueError if the array is not 2D
+        data_array = check_array(data, ensure_2d=True)
+        return [f"x{x}" for x in range(data_array.shape[1])]
+
+    raise ValueError(
+        "The data is not a pandas.DataFrame, a 2D numpy.ndarray or a "
+        "list/tuple that can be converted to a 2D numpy.ndarray."
+    )
 
 
 def _create_config(
@@ -176,7 +233,7 @@ def _create_config(
         the model. It can be one of: ``tabular-classification``,
         ``tabular-regression``, ``text-classification``, ``text-regression``.
 
-    data: array-like
+    data: array-like or sequence
         The input to the model. This is used for two purposes:
 
             1. Save an example input to the model, which is used by
@@ -186,7 +243,10 @@ def _create_config(
                HuggingFace's backend to pass the data in the right form to the
                model.
 
-        The first 3 input values are used as example inputs.
+        The first 3 input values are used as example inputs. If the task is
+        ``tabular-classification`` or ``tabular-regression``, then data is
+        expected to be an array-like. Otherwise, it is expected to be an
+        sequence of strings.
 
     model_format: str
         The format used to persist the model. Can be ``"auto"``, ``"skops"``
@@ -223,13 +283,10 @@ def recursively_default_dict() -> MutableMapping:
     config["sklearn"]["model_format"] = model_format
 
     if "tabular" in task:
-        config["sklearn"]["example_input"] = _get_example_input(data)
+        config["sklearn"]["example_input"] = _get_example_input_from_tabular_data(data)
         config["sklearn"]["columns"] = _get_column_names(data)
     elif "text" in task:
-        if isinstance(data, list) and all(isinstance(x, str) for x in data):
-            config["sklearn"]["example_input"] = {"data": data[:3]}
-        else:
-            raise ValueError("The data needs to be a list of strings.")
+        config["sklearn"]["example_input"] = _get_example_input_from_text_data(data)
 
     dump_json(Path(dst) / "config.json", config)
 

diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py
@@ -33,7 +33,8 @@
 from skops.hub_utils._hf_hub import (
     _create_config,
     _get_column_names,
-    _get_example_input,
+    _get_example_input_from_tabular_data,
+    _get_example_input_from_text_data,
     _validate_folder,
 )
 from skops.hub_utils.tests.common import HF_HUB_TOKEN
@@ -230,7 +231,7 @@ def test_create_config(data, task, expected_config):
 
 
 def test_create_config_invalid_text_data(temp_path):
-    with pytest.raises(ValueError, match="The data needs to be a list of strings."):
+    with pytest.raises(ValueError, match="The data needs to be a sequence of strings."):
         _create_config(
             model_path="model.pkl",
             requirements=['scikit-learn="1.1.1"', "numpy"],
@@ -521,30 +522,68 @@ def test_update_env(repo_path, config_json):
     assert get_requirements(repo_path) == ['scikit-learn="1.1.2"']
 
 
-def test_get_example_input():
-    """Test the _get_example_input function."""
+def test_get_example_input_from_tabular_data():
     with pytest.raises(
-        ValueError, match="The data is not a pandas.DataFrame or a numpy.ndarray."
+        ValueError,
+        match=(
+            "The data is not a pandas.DataFrame, a 2D numpy.ndarray or a "
+            "list/tuple that can be converted to a 2D numpy.ndarray."
+        ),
     ):
-        _get_example_input(["a", "b", "c"])
+        _get_example_input_from_tabular_data("random")
+    with pytest.raises(ValueError):
+        _get_example_input_from_tabular_data(["a", "b", "c"])
+
+    examples = _get_example_input_from_tabular_data(np.ones((5, 10)))
+    # the result is a dictionary of column name: list of values
+    assert len(examples) == 10
+    assert len(examples["x0"]) == 3
 
-    examples = _get_example_input(np.ones((5, 10)))
-    # the result if a dictionary of column name: list of values
+    examples = _get_example_input_from_tabular_data(np.ones((5, 10)).tolist())
+    # the result is a dictionary of column name: list of values
     assert len(examples) == 10
     assert len(examples["x0"]) == 3
 
-    examples = _get_example_input(
+    examples = _get_example_input_from_tabular_data(
         pd.DataFrame(np.ones((5, 10)), columns=[f"column{x}" for x in range(10)])
     )
-    # the result if a dictionary of column name: list of values
+    # the result is a dictionary of column name: list of values
     assert len(examples) == 10
     assert len(examples["column0"]) == 3
 
 
+@pytest.mark.parametrize(
+    "data, expected_length",
+    [
+        (["a", "b", "c", "d"], 3),
+        (np.array(["a", "b", "c", "d"]), 3),
+        (set(["a", "b", "c", "d"]), 3),
+        (tuple(["a", "b", "c", "d"]), 3),
+        (["a"], 1),
+        ([], 0),
+    ],
+)
+def test_get_example_input_from_text_data(data, expected_length):
+    example_input = _get_example_input_from_text_data(data)
+    assert len(example_input["data"]) == expected_length
+
+
+@pytest.mark.parametrize("data", ["random", [1, 2, 3], 420])
+def test_get_example_input_from_text_data_invalid_text_data(data):
+    with pytest.raises(ValueError, match="The data needs to be a sequence of strings."):
+        _get_example_input_from_text_data(data)
+
+
 def test_get_column_names():
     with pytest.raises(
-        ValueError, match="The data is not a pandas.DataFrame or a numpy.ndarray."
+        ValueError,
+        match=(
+            "The data is not a pandas.DataFrame, a 2D numpy.ndarray or a "
+            "list/tuple that can be converted to a 2D numpy.ndarray."
+        ),
     ):
+        _get_column_names("random")
+    with pytest.raises(ValueError):
         _get_column_names(["a", "b", "c"])
 
     X_array = np.ones((5, 10), dtype=np.float32)
@@ -556,11 +595,11 @@ def test_get_column_names():
     assert _get_column_names(X_df) == expected_columns
 
 
-def test_get_example_input_pandas_not_installed(pandas_not_installed):
+def test_get_example_input_from_tabular_data_pandas_not_installed(pandas_not_installed):
     # use pandas_not_installed fixture from conftest.py to pretend that pandas
     # is not installed and check that the function does not raise when pandas
     # import fails
-    _get_example_input(np.ones((5, 10)))
+    _get_example_input_from_tabular_data(np.ones((5, 10)))
 
 
 def test_get_column_names_pandas_not_installed(pandas_not_installed):