From 782b9e083db1cbaef4de2a3ffdab1541e01581e6 Mon Sep 17 00:00:00 2001
From: anferico <f.cariaggi4@gmail.com>
Date: Sun, 9 Oct 2022 17:41:06 +0200
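Subject: [PATCH 01/11] add support for more data types

For tabular tasks, accept any data that can be converted to a 2D
numpy.ndarray instead of only a numpy.ndarray. For text tasks, accept
any iterable of strings (including iterators and generators) instead
of only a list of strings. Also raise a ValueError when pandas cannot
be imported while extracting the example input or the column names.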

---
 skops/hub_utils/_hf_hub.py           | 81 ++++++++++++++++++----------
 skops/hub_utils/tests/test_hf_hub.py |  4 +-
 2 files changed, 56 insertions(+), 29 deletions(-)

diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py
index 5d7a629b..708c0bf1 100644
--- a/skops/hub_utils/_hf_hub.py
+++ b/skops/hub_utils/_hf_hub.py
@@ -5,6 +5,7 @@
 from __future__ import annotations
 
 import collections
+import itertools
 import json
 import os
 import shutil
@@ -73,6 +74,14 @@ def _validate_folder(path: Union[str, Path]) -> None:
         raise TypeError(f"Model file {model_path} does not exist.")
 
 
+def _convert_to_2d_numpy_array(data):
+    data_array = np.asarray(data)
+    if len(data_array.shape) != 2:
+        raise ValueError("The data must be convertible to a 2D numpy.ndarray.")
+
+    return data_array
+
+
 def _get_example_input(data):
     """Returns the example input of a model.
 
@@ -81,9 +90,10 @@ def _get_example_input(data):
 
     Parameters
     ----------
-    data: array-like
-        The input needs to be either a ``pandas.DataFrame`` or a
-        ``numpy.ndarray``. The first 3 rows are used as example input.
+    data: pandas.DataFrame or array-like
+        The input needs to be anything that can be converted to a 2D
+        ``numpy.ndarray``, including a ``pandas.DataFrame``. The first 3 rows
+        are used as example input.
 
     Returns
     -------
@@ -95,28 +105,29 @@ def _get_example_input(data):
 
         if isinstance(data, pd.DataFrame):
             return {x: data[x][:3].to_list() for x in data.columns}
-    except ImportError:
-        # pandas is not installed, the data cannot be a pandas DataFrame
-        pass
+    except ImportError as e:
+        raise ValueError(
+            "The data cannot be a pandas.DataFrame because pandas is not installed."
+        ) from e
 
     # here we convert the first three rows of the numpy array to a dict of lists
     # to be stored in the config file
-    if isinstance(data, np.ndarray):
-        return {f"x{x}": data[:3, x].tolist() for x in range(data.shape[1])}
-
-    raise ValueError("The data is not a pandas.DataFrame or a numpy.ndarray.")
+    data_array = _convert_to_2d_numpy_array(data)
+    return {f"x{x}": data_array[:3, x].tolist() for x in range(data_array.shape[1])}
 
 
 def _get_column_names(data):
     """Returns the column names of the input.
 
-    If data is a ``numpy.ndarray``, column names are assumed to be ``x0`` to
-    ``xn-1``, where ``n`` is the number of columns.
+    If data is not a ``pandas.DataFrame``, column names are assumed to be
+    ``x0`` to ``xn-1``, where ``n`` is the number of columns.
 
     Parameters
     ----------
-    data: pandas.DataFrame or numpy.ndarray
-        The data whose columns names are to be returned.
+    data: pandas.DataFrame or array-like
+        The data whose columns names are to be returned. Must be a
+        ``pandas.DataFrame`` or anything that can be converted to a 2D
+        ``numpy.ndarray``
 
     Returns
     -------
@@ -128,16 +139,13 @@ def _get_column_names(data):
 
         if isinstance(data, pd.DataFrame):
             return list(data.columns)
-    except ImportError:
-        # pandas is not installed, the data cannot be a pandas DataFrame
-        pass
-
-    # TODO: this is going to fail for Structured Arrays. We can add support for
-    # them later if we see need for it.
-    if isinstance(data, np.ndarray):
-        return [f"x{x}" for x in range(data.shape[1])]
+    except ImportError as e:
+        raise ValueError(
+            "The data cannot be a pandas.DataFrame because pandas is not installed."
+        ) from e
 
-    raise ValueError("The data is not a pandas.DataFrame or a numpy.ndarray.")
+    data_array = _convert_to_2d_numpy_array(data)
+    return [f"x{x}" for x in range(data_array.shape[1])]
 
 
 def _create_config(
@@ -174,7 +182,7 @@ def _create_config(
         the model. It can be one of: ``tabular-classification``,
         ``tabular-regression``, ``text-classification``, ``text-regression``.
 
-    data: array-like
+    data: array-like, or iterable
         The input to the model. This is used for two purposes:
 
             1. Save an example input to the model, which is used by
@@ -184,7 +192,10 @@ def _create_config(
                HuggingFace's backend to pass the data in the right form to the
                model.
 
-        The first 3 input values are used as example inputs.
+        The first 3 input values are used as example inputs. If the task is
+        ``tabular-classification`` or ``tabular-regression``, then data is
+        expected to be an array-like. Otherwise, it is expected to be an
+        iterable of strings.
 
     Returns
     -------
@@ -205,14 +216,28 @@ def recursively_default_dict() -> MutableMapping:
         config["sklearn"]["example_input"] = _get_example_input(data)
         config["sklearn"]["columns"] = _get_column_names(data)
     elif "text" in task:
-        if isinstance(data, list) and all(isinstance(x, str) for x in data):
-            config["sklearn"]["example_input"] = {"data": data[:3]}
+        if _is_iterable_of_strings(data):
+            config["sklearn"]["example_input"] = {
+                "data": list(itertools.islice(data, 3))
+            }
         else:
-            raise ValueError("The data needs to be a list of strings.")
+            raise ValueError("The data needs to be an iterable of strings.")
 
     dump_json(Path(dst) / "config.json", config)
 
 
+def _is_iterable_of_strings(data):
+    if isinstance(data, str):
+        return False
+    try:
+        # needed in case data is an iterator or a generator
+        data, data_copy = itertools.tee(data, 2)
+        return all(isinstance(x, str) for x in data_copy)
+    except TypeError:
+        # data is not iterable
+        return False
+
+
 def _check_model_file(path: str | Path) -> Path:
     """Perform sanity checks on the model file
 
diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py
index 362c2664..346cb7a7 100644
--- a/skops/hub_utils/tests/test_hf_hub.py
+++ b/skops/hub_utils/tests/test_hf_hub.py
@@ -197,7 +197,9 @@ def test_create_config(data, task, expected_config):
 
 
 def test_create_config_invalid_text_data(temp_path):
-    with pytest.raises(ValueError, match="The data needs to be a list of strings."):
+    with pytest.raises(
+        ValueError, match="The data needs to be an iterable of strings."
+    ):
         _create_config(
             model_path="model.pkl",
             requirements=['scikit-learn="1.1.1"', "numpy"],

From fe798d89091cc256bdfcf1b3ab8f06251e1467e7 Mon Sep 17 00:00:00 2001
From: anferico <f.cariaggi4@gmail.com>
Date: Tue, 11 Oct 2022 18:25:38 +0200
Subject: [PATCH 02/11] Add docstrings

---
 skops/hub_utils/_hf_hub.py | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py
index 708c0bf1..c393a184 100644
--- a/skops/hub_utils/_hf_hub.py
+++ b/skops/hub_utils/_hf_hub.py
@@ -75,6 +75,26 @@ def _validate_folder(path: Union[str, Path]) -> None:
 
 
 def _convert_to_2d_numpy_array(data):
+    """Converts an array-like object to a 2D numpy.ndarray.
+
+    Raises a ``ValueError`` if data cannot be converted to a 2D numpy.ndarray.
+
+    Parameters
+    ----------
+    data: pandas.DataFrame or array-like
+        Any object that can be converted to a 2D ``numpy.ndarray``, including
+        a ``pandas.DataFrame``.
+
+    Raises
+    ------
+    ValueError
+        Raised when the passed object cannot be converted to 2D numpy.ndarray.
+
+    Returns
+    -------
+    data_array: numpy.ndarray
+        The numpy.ndarray object obtained by converting data.
+    """
     data_array = np.asarray(data)
     if len(data_array.shape) != 2:
         raise ValueError("The data must be convertible to a 2D numpy.ndarray.")
@@ -227,6 +247,19 @@ def recursively_default_dict() -> MutableMapping:
 
 
 def _is_iterable_of_strings(data):
+    """Checks whether data is an iterable of strings.
+
+    Parameters
+    ----------
+    data: Any
+        Any object.
+
+    Returns
+    -------
+    is_iterable_of_strings: bool
+        A boolean variable indicating whether or not data is an iterable of
+        strings.
+    """
     if isinstance(data, str):
         return False
     try:

From a8f5e4d43ad464b6da8e301e913baffb21b7ce99 Mon Sep 17 00:00:00 2001
From: anferico <f.cariaggi4@gmail.com>
Date: Tue, 11 Oct 2022 20:02:53 +0200
Subject: [PATCH 03/11] Update unit tests

---
 skops/hub_utils/tests/test_hf_hub.py | 64 +++++++++++++++++++++++++++-
 1 file changed, 62 insertions(+), 2 deletions(-)

diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py
index 346cb7a7..ea8ab507 100644
--- a/skops/hub_utils/tests/test_hf_hub.py
+++ b/skops/hub_utils/tests/test_hf_hub.py
@@ -157,6 +157,42 @@ def test_validate_folder(config_json):
                 }
             },
         ),
+        (
+            iris.data.values,
+            "tabular-classification",
+            {
+                "sklearn": {
+                    "columns": ["x0", "x1", "x2", "x3"],
+                    "environment": ['scikit-learn="1.1.1"', "numpy"],
+                    "example_input": {
+                        "x0": [1.4, 1.4, 1.3],
+                        "x1": [0.2, 0.2, 0.2],
+                        "x2": [5.1, 4.9, 4.7],
+                        "x3": [3.5, 3.0, 3.2],
+                    },
+                    "model": {"file": "model.pkl"},
+                    "task": "tabular-classification",
+                }
+            },
+        ),
+        (
+            iris.data.values.tolist(),
+            "tabular-classification",
+            {
+                "sklearn": {
+                    "columns": ["x0", "x1", "x2", "x3"],
+                    "environment": ['scikit-learn="1.1.1"', "numpy"],
+                    "example_input": {
+                        "x0": [1.4, 1.4, 1.3],
+                        "x1": [0.2, 0.2, 0.2],
+                        "x2": [5.1, 4.9, 4.7],
+                        "x3": [3.5, 3.0, 3.2],
+                    },
+                    "model": {"file": "model.pkl"},
+                    "task": "tabular-classification",
+                }
+            },
+        ),
         (
             ["test", "text", "problem", "random"],
             "text-classification",
@@ -169,6 +205,30 @@ def test_validate_folder(config_json):
                 }
             },
         ),
+        (
+            np.array(["test", "text", "problem", "random"]),
+            "text-classification",
+            {
+                "sklearn": {
+                    "environment": ['scikit-learn="1.1.1"', "numpy"],
+                    "example_input": {"data": ["test", "text", "problem"]},
+                    "model": {"file": "model.pkl"},
+                    "task": "text-classification",
+                }
+            },
+        ),
+        (
+            (f"test{n}" for n in range(4)),
+            "text-classification",
+            {
+                "sklearn": {
+                    "environment": ['scikit-learn="1.1.1"', "numpy"],
+                    "example_input": {"data": ["test0", "test1", "test2"]},
+                    "model": {"file": "model.pkl"},
+                    "task": "text-classification",
+                }
+            },
+        ),
     ],
 )
 def test_create_config(data, task, expected_config):
@@ -475,7 +535,7 @@ def test_update_env(repo_path, config_json):
 def test_get_example_input():
     """Test the _get_example_input function."""
     with pytest.raises(
-        ValueError, match="The data is not a pandas.DataFrame or a numpy.ndarray."
+        ValueError, match="The data must be convertible to a 2D numpy.ndarray."
     ):
         _get_example_input(["a", "b", "c"])
 
@@ -494,7 +554,7 @@ def test_get_example_input():
 
 def test_get_column_names():
     with pytest.raises(
-        ValueError, match="The data is not a pandas.DataFrame or a numpy.ndarray."
+        ValueError, match="The data must be convertible to a 2D numpy.ndarray."
     ):
         _get_column_names(["a", "b", "c"])
 

From 5d7e75d1a3b0de5a72b86912223cde95700d14a4 Mon Sep 17 00:00:00 2001
From: anferico <f.cariaggi4@gmail.com>
Date: Tue, 11 Oct 2022 20:03:33 +0200
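Subject: [PATCH 04/11] Revert some erroneous changes

Restore the silent `except ImportError: pass` fallback in
`_get_example_input` and `_get_column_names`. Raising a ValueError
there made both functions fail for every input whenever pandas was
not installed, even when the data was not a pandas.DataFrame.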

---
 skops/hub_utils/_hf_hub.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py
index c393a184..17a99797 100644
--- a/skops/hub_utils/_hf_hub.py
+++ b/skops/hub_utils/_hf_hub.py
@@ -125,10 +125,9 @@ def _get_example_input(data):
 
         if isinstance(data, pd.DataFrame):
             return {x: data[x][:3].to_list() for x in data.columns}
-    except ImportError as e:
-        raise ValueError(
-            "The data cannot be a pandas.DataFrame because pandas is not installed."
-        ) from e
+    except ImportError:
+        # pandas is not installed, the data cannot be a pandas DataFrame
+        pass
 
     # here we convert the first three rows of the numpy array to a dict of lists
     # to be stored in the config file
@@ -159,10 +158,9 @@ def _get_column_names(data):
 
         if isinstance(data, pd.DataFrame):
             return list(data.columns)
-    except ImportError as e:
-        raise ValueError(
-            "The data cannot be a pandas.DataFrame because pandas is not installed."
-        ) from e
+    except ImportError:
+        # pandas is not installed, the data cannot be a pandas DataFrame
+        pass
 
     data_array = _convert_to_2d_numpy_array(data)
     return [f"x{x}" for x in range(data_array.shape[1])]

From ef89103d15e235f5e41a281fd4cc39d87c7d854e Mon Sep 17 00:00:00 2001
From: anferico <f.cariaggi4@gmail.com>
Date: Tue, 11 Oct 2022 20:11:22 +0200
Subject: [PATCH 05/11] Update changelog

---
 docs/changes.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/changes.rst b/docs/changes.rst
index ca2b49f8..e0c9c0c9 100644
--- a/docs/changes.rst
+++ b/docs/changes.rst
@@ -11,6 +11,7 @@ skops Changelog
 
 v0.3
 ----
+- Support more array-like data types for tabular data and list-like data types for text data. :pr:`179` by `Francesco Cariaggi`_.
 - Utility function to add arbitrary files to be uploaded to the hub by using
   :func:`.hub_utils.add_files`. :pr:`123` by `Benjamin Bossan`_.
 - Add ``private`` as an optional argument to :meth:`.hub_utils.push` to

From def297b600bba9c19a8bbf6140945e0802cdf45c Mon Sep 17 00:00:00 2001
From: anferico <f.cariaggi4@gmail.com>
Date: Thu, 13 Oct 2022 18:33:41 +0200
Subject: [PATCH 06/11] Add myself as contributor

---
 docs/changes.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/changes.rst b/docs/changes.rst
index e0c9c0c9..8f8fec47 100644
--- a/docs/changes.rst
+++ b/docs/changes.rst
@@ -56,4 +56,4 @@ Contributors
 ~~~~~~~~~~~~
 
 :user:`Adrin Jalali <adrinjalali>`, :user:`Merve Noyan <merveenoyan>`,
-:user:`Benjamin Bossan <BenjaminBossan>`
+:user:`Benjamin Bossan <BenjaminBossan>`, :user:`Francesco Cariaggi <anferico>`

From 100169ef85c12fb0a465607fe98989ff2722ee60 Mon Sep 17 00:00:00 2001
From: anferico <f.cariaggi4@gmail.com>
Date: Thu, 13 Oct 2022 21:10:00 +0200
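Subject: [PATCH 07/11] Address PR review comments

* Check `ndim` instead of `len(shape)` in `_convert_to_2d_numpy_array`.
* Slice the first 3 rows before converting the data to a numpy array
  in `_get_example_input`.
* Replace `_is_iterable_of_strings` with `_head` and
  `_is_sequence_of_strings`, so that only the first 3 elements of the
  text data are extracted and validated.
* Add tests for the new helpers and for non-iterable text data.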

---
 skops/hub_utils/_hf_hub.py           | 68 ++++++++++++++++++++--------
 skops/hub_utils/tests/test_hf_hub.py | 36 ++++++++++++++-
 2 files changed, 84 insertions(+), 20 deletions(-)

diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py
index 17a99797..5d6bc237 100644
--- a/skops/hub_utils/_hf_hub.py
+++ b/skops/hub_utils/_hf_hub.py
@@ -11,7 +11,7 @@
 import shutil
 import warnings
 from pathlib import Path
-from typing import Any, List, MutableMapping, Optional, Union
+from typing import Any, Iterable, List, MutableMapping, Optional, Union
 
 import numpy as np
 from huggingface_hub import HfApi, InferenceApi, snapshot_download
@@ -96,7 +96,7 @@ def _convert_to_2d_numpy_array(data):
         The numpy.ndarray object obtained by converting data.
     """
     data_array = np.asarray(data)
-    if len(data_array.shape) != 2:
+    if data_array.ndim != 2:
         raise ValueError("The data must be convertible to a 2D numpy.ndarray.")
 
     return data_array
@@ -131,8 +131,12 @@ def _get_example_input(data):
 
     # here we convert the first three rows of the numpy array to a dict of lists
     # to be stored in the config file
-    data_array = _convert_to_2d_numpy_array(data)
-    return {f"x{x}": data_array[:3, x].tolist() for x in range(data_array.shape[1])}
+    data_slice = data[:3]
+    data_slice_array = _convert_to_2d_numpy_array(data_slice)
+    return {
+        f"x{x}": data_slice_array[:3, x].tolist()
+        for x in range(data_slice_array.shape[1])
+    }
 
 
 def _get_column_names(data):
@@ -234,18 +238,48 @@ def recursively_default_dict() -> MutableMapping:
         config["sklearn"]["example_input"] = _get_example_input(data)
         config["sklearn"]["columns"] = _get_column_names(data)
     elif "text" in task:
-        if _is_iterable_of_strings(data):
-            config["sklearn"]["example_input"] = {
-                "data": list(itertools.islice(data, 3))
-            }
-        else:
-            raise ValueError("The data needs to be an iterable of strings.")
+        error_message = "The data needs to be an iterable of strings."
+        try:
+            data_head = _head(data, n=3)
+            if _is_sequence_of_strings(data_head):
+                config["sklearn"]["example_input"] = {"data": data_head}
+            else:
+                raise ValueError(error_message)
+        except TypeError as e:
+            raise ValueError(error_message) from e
 
     dump_json(Path(dst) / "config.json", config)
 
 
-def _is_iterable_of_strings(data):
-    """Checks whether data is an iterable of strings.
+def _head(data: Iterable, n: int):
+    """Returns the first n elements of data.
+
+    Raises a ``TypeError`` if data is not an iterable.
+
+    Parameters
+    ----------
+    data: Iterable
+        Any iterable.
+
+    n: int
+        Number of elements to extract from the head of data.
+
+    Raises
+    ------
+    TypeError
+        If data is not an iterable (raised by itertools.islice).
+
+    Returns
+    -------
+    data_head: list
+        A list containing the first n elements of data.
+    """
+    data, data_copy = itertools.tee(data, 2)
+    return list(itertools.islice(data_copy, n))
+
+
+def _is_sequence_of_strings(data):
+    """Checks whether data is a sequence of strings.
 
     Parameters
     ----------
@@ -254,18 +288,16 @@ def _is_iterable_of_strings(data):
 
     Returns
     -------
-    is_iterable_of_strings: bool
-        A boolean variable indicating whether or not data is an iterable of
+    is_sequence_of_strings: bool
+        A boolean variable indicating whether or not data is a sequence of
         strings.
     """
     if isinstance(data, str):
         return False
     try:
-        # needed in case data is an iterator or a generator
-        data, data_copy = itertools.tee(data, 2)
-        return all(isinstance(x, str) for x in data_copy)
+        return all(isinstance(x, str) for x in data)
     except TypeError:
-        # data is not iterable
+        # data isn't even iterable, can't be a sequence of strings
         return False
 
 
diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py
index ea8ab507..7905d94f 100644
--- a/skops/hub_utils/tests/test_hf_hub.py
+++ b/skops/hub_utils/tests/test_hf_hub.py
@@ -32,6 +32,8 @@
     _create_config,
     _get_column_names,
     _get_example_input,
+    _head,
+    _is_sequence_of_strings,
     _validate_folder,
 )
 from skops.hub_utils.tests.common import HF_HUB_TOKEN
@@ -256,7 +258,8 @@ def test_create_config(data, task, expected_config):
             )
 
 
-def test_create_config_invalid_text_data(temp_path):
+@pytest.mark.parametrize("data", [[1, 2, 3], 420])
+def test_create_config_invalid_text_data(data, temp_path):
     with pytest.raises(
         ValueError, match="The data needs to be an iterable of strings."
     ):
@@ -264,11 +267,40 @@ def test_create_config_invalid_text_data(temp_path):
             model_path="model.pkl",
             requirements=['scikit-learn="1.1.1"', "numpy"],
             task="text-classification",
-            data=[1, 2, 3],
+            data=data,
             dst=temp_path,
         )
 
 
+@pytest.mark.parametrize(
+    "data, n, expected_output",
+    [
+        ([0, "1", 2, 3, 4], 3, [0, "1", 2]),
+        ((i for i in range(5)), 3, [0, 1, 2]),
+    ],
+)
+def test_head(data, n, expected_output):
+    assert _head(data, n) == expected_output
+
+
+def test_head_invalid_iterable():
+    with pytest.raises(TypeError):
+        _head(420)
+
+
+@pytest.mark.parametrize(
+    "data, is_sequence_of_strings",
+    [
+        ("sample text", False),
+        (["sample", 420], False),
+        (420, False),
+        (["sample", "text"], True),
+    ],
+)
+def test_is_sequence_of_strings(data, is_sequence_of_strings):
+    assert _is_sequence_of_strings(data) == is_sequence_of_strings
+
+
 def test_atomic_init(classifier_pickle, temp_path):
     with pytest.raises(ValueError):
         # this fails since we're passing an invalid task.

From eab3c299d586c0a92c0b21e1756c2b722c92a065 Mon Sep 17 00:00:00 2001
From: anferico <f.cariaggi4@gmail.com>
Date: Thu, 8 Dec 2022 11:08:10 +0100
Subject: [PATCH 08/11] Update changes.rst

---
 docs/changes.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/changes.rst b/docs/changes.rst
index 01d7c1d3..70453a3f 100644
--- a/docs/changes.rst
+++ b/docs/changes.rst
@@ -17,7 +17,8 @@ v0.4
 
 v0.3
 ----
-- Support more array-like data types for tabular data and list-like data types for text data. :pr:`179` by `Francesco Cariaggi`_.
+- Support more array-like data types for tabular data and list-like data types
+  for text data. :pr:`179` by `Francesco Cariaggi`_.
 - Utility function to add arbitrary files to be uploaded to the hub by using
   :func:`.hub_utils.add_files`. :pr:`123` by `Benjamin Bossan`_.
 - Add ``private`` as an optional argument to :meth:`skops.hub_utils.push` to

From e6f5d2e1d555f3e598c1c818222012641cacbe32 Mon Sep 17 00:00:00 2001
From: anferico <f.cariaggi4@gmail.com>
Date: Thu, 8 Dec 2022 11:09:32 +0100
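Subject: [PATCH 09/11] Address PR comments

* Split `_get_example_input` into
  `_get_example_input_from_tabular_data` and
  `_get_example_input_from_text_data`.
* Validate tabular data with `sklearn.utils.check_array` and accept
  only a pandas.DataFrame, a numpy.ndarray, a list or a tuple.
* Move `_head` and `_is_sequence_of_strings` into
  `_get_example_input_from_text_data` as local helpers.
* Update the tests to match the new function names and accepted types.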

---
 skops/hub_utils/_hf_hub.py           | 180 +++++++++++----------------
 skops/hub_utils/tests/test_hf_hub.py | 175 +++++++++-----------------
 2 files changed, 134 insertions(+), 221 deletions(-)

diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py
index e011c12e..5df8734d 100644
--- a/skops/hub_utils/_hf_hub.py
+++ b/skops/hub_utils/_hf_hub.py
@@ -14,6 +14,7 @@
 
 import numpy as np
 from huggingface_hub import HfApi, InferenceApi, snapshot_download
+from sklearn.utils import check_array
 
 from ..utils.fixes import Literal
 
@@ -73,69 +74,89 @@ def _validate_folder(path: Union[str, Path]) -> None:
         raise TypeError(f"Model file {model_path} does not exist.")
 
 
-def _convert_to_2d_numpy_array(data):
-    """Converts an array-like object to a 2D numpy.ndarray.
+def _get_example_input_from_tabular_data(data):
+    """Returns the example input of a model for a tabular task.
 
-    Raises a ``ValueError`` if data cannot be converted to a 2D numpy.ndarray.
+    The input is converted into a dictionary which is then stored in the config
+    file.
 
     Parameters
     ----------
-    data: pandas.DataFrame or array-like
-        Any object that can be converted to a 2D ``numpy.ndarray``, including
-        a ``pandas.DataFrame``.
-
-    Raises
-    ------
-    ValueError
-        Raised when the passed object cannot be converted to 2D numpy.ndarray.
+    data: array-like
+        The input needs to be either a ``pandas.DataFrame``, a 2D
+        ``numpy.ndarray`` or a list/tuple that can be converted to a 2D
+        ``numpy.ndarray``. The first 3 rows are used as example input.
 
     Returns
     -------
-    data_array: numpy.ndarray
-        The numpy.ndarray object obtained by converting data.
+    example_input: dict of lists
+        The example input of the model as accepted by Hugging Face's backend.
     """
-    data_array = np.asarray(data)
-    if data_array.ndim != 2:
-        raise ValueError("The data must be convertible to a 2D numpy.ndarray.")
+    try:
+        import pandas as pd
+
+        if isinstance(data, pd.DataFrame):
+            return {x: data[x][:3].to_list() for x in data.columns}
+    except ImportError:
+        # pandas is not installed, the data cannot be a pandas DataFrame
+        pass
 
-    return data_array
+    # here we convert the first three rows of `data` to a dict of lists
+    # to be stored in the config file
+    if isinstance(data, (np.ndarray, list, tuple)):
+        data_slice = data[:3]
+        # This will raise a ValueError if the array is not 2D
+        data_slice_array = check_array(data_slice, ensure_2d=True)
+        return {
+            f"x{x}": data_slice_array[:, x].tolist()
+            for x in range(data_slice_array.shape[1])
+        }
+
+    raise ValueError(
+        "The data is not a pandas.DataFrame, a 2D numpy.ndarray or a "
+        "list/tuple that can be converted to a 2D numpy.ndarray."
+    )
 
 
-def _get_example_input(data):
-    """Returns the example input of a model.
+def _get_example_input_from_text_data(data: Iterable[str]):
+    """Returns the example input of a model for a text task.
 
     The input is converted into a dictionary which is then stored in the config
     file.
 
     Parameters
     ----------
-    data: pandas.DataFrame or array-like
-        The input needs to be anything that can be converted to a 2D
-        ``numpy.ndarray``, including a ``pandas.DataFrame``. The first 3 rows
-        are used as example input.
+    data: Iterable[str]
+        An iterable of strings. The first 3 elements are used as example input.
 
     Returns
     -------
     example_input: dict of lists
         The example input of the model as accepted by Hugging Face's backend.
     """
-    try:
-        import pandas as pd
 
-        if isinstance(data, pd.DataFrame):
-            return {x: data[x][:3].to_list() for x in data.columns}
-    except ImportError:
-        # pandas is not installed, the data cannot be a pandas DataFrame
-        pass
+    def _head(data, n):
+        data, data_copy = itertools.tee(data, 2)
+        return list(itertools.islice(data_copy, n))
 
-    # here we convert the first three rows of the numpy array to a dict of lists
-    # to be stored in the config file
-    data_slice = data[:3]
-    data_slice_array = _convert_to_2d_numpy_array(data_slice)
-    return {
-        f"x{x}": data_slice_array[:3, x].tolist()
-        for x in range(data_slice_array.shape[1])
-    }
+    def _is_sequence_of_strings(data):
+        if isinstance(data, str):
+            return False
+        try:
+            return all(isinstance(x, str) for x in data)
+        except TypeError:
+            # data isn't even iterable, can't be a sequence of strings
+            return False
+
+    error_message = "The data needs to be an iterable of strings."
+    try:
+        data_head = _head(data, n=3)
+        if _is_sequence_of_strings(data_head):
+            return {"data": data_head}
+        else:
+            raise ValueError(error_message)
+    except TypeError as e:
+        raise ValueError(error_message) from e
 
 
 def _get_column_names(data):
@@ -146,14 +167,14 @@ def _get_column_names(data):
 
     Parameters
     ----------
-    data: pandas.DataFrame or array-like
+    data: array-like
         The data whose columns names are to be returned. Must be a
-        ``pandas.DataFrame`` or anything that can be converted to a 2D
-        ``numpy.ndarray``
+        ``pandas.DataFrame``, a 2D ``numpy.ndarray`` or a list/tuple that can
+        be converted to a 2D ``numpy.ndarray``.
 
     Returns
     -------
-    columns: list of tuples
+    columns: list of strings
         A list of strings. Each string is a column name.
     """
     try:
@@ -165,8 +186,17 @@ def _get_column_names(data):
         # pandas is not installed, the data cannot be a pandas DataFrame
         pass
 
-    data_array = _convert_to_2d_numpy_array(data)
-    return [f"x{x}" for x in range(data_array.shape[1])]
+    # TODO: this is going to fail for Structured Arrays. We can add support for
+    # them later if we see need for it.
+    if isinstance(data, (np.ndarray, list, tuple)):
+        # This will raise a ValueError if the array is not 2D
+        data_array = check_array(data, ensure_2d=True)
+        return [f"x{x}" for x in range(data_array.shape[1])]
+
+    raise ValueError(
+        "The data is not a pandas.DataFrame, a 2D numpy.ndarray or a "
+        "list/tuple that can be converted to a 2D numpy.ndarray."
+    )
 
 
 def _create_config(
@@ -234,72 +264,14 @@ def recursively_default_dict() -> MutableMapping:
     config["sklearn"]["task"] = task
 
     if "tabular" in task:
-        config["sklearn"]["example_input"] = _get_example_input(data)
+        config["sklearn"]["example_input"] = _get_example_input_from_tabular_data(data)
         config["sklearn"]["columns"] = _get_column_names(data)
     elif "text" in task:
-        error_message = "The data needs to be an iterable of strings."
-        try:
-            data_head = _head(data, n=3)
-            if _is_sequence_of_strings(data_head):
-                config["sklearn"]["example_input"] = {"data": data_head}
-            else:
-                raise ValueError(error_message)
-        except TypeError as e:
-            raise ValueError(error_message) from e
+        config["sklearn"]["example_input"] = _get_example_input_from_text_data(data)
 
     dump_json(Path(dst) / "config.json", config)
 
 
-def _head(data: Iterable, n: int):
-    """Returns the first n elements of data.
-
-    Raises a ``TypeError`` if data is not an iterable.
-
-    Parameters
-    ----------
-    data: Iterable
-        Any iterable.
-
-    n: int
-        Number of elements to extract from the head of data.
-
-    Raises
-    ------
-    TypeError
-        If data is not an iterable (raised by itertools.islice).
-
-    Returns
-    -------
-    data_head: list
-        A list containing the first n elements of data.
-    """
-    data, data_copy = itertools.tee(data, 2)
-    return list(itertools.islice(data_copy, n))
-
-
-def _is_sequence_of_strings(data):
-    """Checks whether data is a sequence of strings.
-
-    Parameters
-    ----------
-    data: Any
-        Any object.
-
-    Returns
-    -------
-    is_sequence_of_strings: bool
-        A boolean variable indicating whether or not data is a sequence of
-        strings.
-    """
-    if isinstance(data, str):
-        return False
-    try:
-        return all(isinstance(x, str) for x in data)
-    except TypeError:
-        # data isn't even iterable, can't be a sequence of strings
-        return False
-
-
 def _check_model_file(path: str | Path) -> Path:
     """Perform sanity checks on the model file
 
diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py
index 068e8c27..2252e684 100644
--- a/skops/hub_utils/tests/test_hf_hub.py
+++ b/skops/hub_utils/tests/test_hf_hub.py
@@ -32,9 +32,8 @@
 from skops.hub_utils._hf_hub import (
     _create_config,
     _get_column_names,
-    _get_example_input,
-    _head,
-    _is_sequence_of_strings,
+    _get_example_input_from_tabular_data,
+    _get_example_input_from_text_data,
     _validate_folder,
 )
 from skops.hub_utils.tests.common import HF_HUB_TOKEN
@@ -162,42 +161,6 @@ def test_validate_folder(config_json):
                 }
             },
         ),
-        (
-            iris.data.values,
-            "tabular-classification",
-            {
-                "sklearn": {
-                    "columns": ["x0", "x1", "x2", "x3"],
-                    "environment": ['scikit-learn="1.1.1"', "numpy"],
-                    "example_input": {
-                        "x0": [1.4, 1.4, 1.3],
-                        "x1": [0.2, 0.2, 0.2],
-                        "x2": [5.1, 4.9, 4.7],
-                        "x3": [3.5, 3.0, 3.2],
-                    },
-                    "model": {"file": "model.pkl"},
-                    "task": "tabular-classification",
-                }
-            },
-        ),
-        (
-            iris.data.values.tolist(),
-            "tabular-classification",
-            {
-                "sklearn": {
-                    "columns": ["x0", "x1", "x2", "x3"],
-                    "environment": ['scikit-learn="1.1.1"', "numpy"],
-                    "example_input": {
-                        "x0": [1.4, 1.4, 1.3],
-                        "x1": [0.2, 0.2, 0.2],
-                        "x2": [5.1, 4.9, 4.7],
-                        "x3": [3.5, 3.0, 3.2],
-                    },
-                    "model": {"file": "model.pkl"},
-                    "task": "tabular-classification",
-                }
-            },
-        ),
         (
             ["test", "text", "problem", "random"],
             "text-classification",
@@ -210,30 +173,6 @@ def test_validate_folder(config_json):
                 }
             },
         ),
-        (
-            np.array(["test", "text", "problem", "random"]),
-            "text-classification",
-            {
-                "sklearn": {
-                    "environment": ['scikit-learn="1.1.1"', "numpy"],
-                    "example_input": {"data": ["test", "text", "problem"]},
-                    "model": {"file": "model.pkl"},
-                    "task": "text-classification",
-                }
-            },
-        ),
-        (
-            (f"test{n}" for n in range(4)),
-            "text-classification",
-            {
-                "sklearn": {
-                    "environment": ['scikit-learn="1.1.1"', "numpy"],
-                    "example_input": {"data": ["test0", "test1", "test2"]},
-                    "model": {"file": "model.pkl"},
-                    "task": "text-classification",
-                }
-            },
-        ),
     ],
 )
 def test_create_config(data, task, expected_config):
@@ -261,49 +200,6 @@ def test_create_config(data, task, expected_config):
             )
 
 
-@pytest.mark.parametrize("data", [[1, 2, 3], 420])
-def test_create_config_invalid_text_data(data, temp_path):
-    with pytest.raises(
-        ValueError, match="The data needs to be an iterable of strings."
-    ):
-        _create_config(
-            model_path="model.pkl",
-            requirements=['scikit-learn="1.1.1"', "numpy"],
-            task="text-classification",
-            data=data,
-            dst=temp_path,
-        )
-
-
-@pytest.mark.parametrize(
-    "data, n, expected_output",
-    [
-        ([0, "1", 2, 3, 4], 3, [0, "1", 2]),
-        ((i for i in range(5)), 3, [0, 1, 2]),
-    ],
-)
-def test_head(data, n, expected_output):
-    assert _head(data, n) == expected_output
-
-
-def test_head_invalid_iterable():
-    with pytest.raises(TypeError):
-        _head(420)
-
-
-@pytest.mark.parametrize(
-    "data, is_sequence_of_strings",
-    [
-        ("sample text", False),
-        (["sample", 420], False),
-        (420, False),
-        (["sample", "text"], True),
-    ],
-)
-def test_is_sequence_of_strings(data, is_sequence_of_strings):
-    assert _is_sequence_of_strings(data) == is_sequence_of_strings
-
-
 def test_atomic_init(classifier_pickle, temp_path):
     with pytest.raises(ValueError):
         # this fails since we're passing an invalid task.
@@ -571,30 +467,75 @@ def test_update_env(repo_path, config_json):
     assert get_requirements(repo_path) == ['scikit-learn="1.1.2"']
 
 
-def test_get_example_input():
-    """Test the _get_example_input function."""
+def test_get_example_input_from_tabular_data():
     with pytest.raises(
-        ValueError, match="The data must be convertible to a 2D numpy.ndarray."
+        ValueError,
+        match=(
+            "The data is not a pandas.DataFrame, a 2D numpy.ndarray or a "
+            "list/tuple that can be converted to a 2D numpy.ndarray."
+        ),
     ):
-        _get_example_input(["a", "b", "c"])
+        _get_example_input_from_tabular_data("random")
+    with pytest.raises(ValueError):
+        _get_example_input_from_tabular_data(["a", "b", "c"])
 
-    examples = _get_example_input(np.ones((5, 10)))
-    # the result if a dictionary of column name: list of values
+    examples = _get_example_input_from_tabular_data(np.ones((5, 10)))
+    # the result is a dictionary of column name: list of values
     assert len(examples) == 10
     assert len(examples["x0"]) == 3
 
-    examples = _get_example_input(
+    examples = _get_example_input_from_tabular_data(np.ones((5, 10)).tolist())
+    # the result is a dictionary of column name: list of values
+    assert len(examples) == 10
+    assert len(examples["x0"]) == 3
+
+    examples = _get_example_input_from_tabular_data(
         pd.DataFrame(np.ones((5, 10)), columns=[f"column{x}" for x in range(10)])
     )
-    # the result if a dictionary of column name: list of values
+    # the result is a dictionary of column name: list of values
     assert len(examples) == 10
     assert len(examples["column0"]) == 3
 
 
+def test_get_example_input_from_text_data():
+    examples = _get_example_input_from_text_data(["a", "b", "c", "d"])
+    assert len(examples) == 3
+
+    examples = _get_example_input_from_text_data(np.array(["a", "b", "c", "d"]))
+    assert len(examples) == 3
+
+    examples = _get_example_input_from_text_data((c for c in ["a", "b", "c", "d"]))
+    assert len(examples) == 3
+
+    examples = _get_example_input_from_text_data([])
+    assert len(examples) == 0
+
+
+@pytest.mark.parametrize("data", ["random", [1, 2, 3], 420])
+def test_get_example_input_from_text_data_invalid_text_data(data):
+    with pytest.raises(
+        ValueError, match="The data needs to be an iterable of strings."
+    ):
+        _get_example_input_from_text_data(data)
+
+
+def test_get_example_input_from_text_data_generator_not_exhausted():
+    generator = (f"s{x}" for x in range(3))
+    _get_example_input_from_text_data(generator)
+    # check that next() doesn't raise a StopIteration
+    next(generator)
+
+
 def test_get_column_names():
     with pytest.raises(
-        ValueError, match="The data must be convertible to a 2D numpy.ndarray."
+        ValueError,
+        match=(
+            "The data is not a pandas.DataFrame, a 2D numpy.ndarray or a "
+            "list/tuple that can be converted to a 2D numpy.ndarray."
+        ),
     ):
+        _get_column_names("random")
+    with pytest.raises(ValueError):
         _get_column_names(["a", "b", "c"])
 
     X_array = np.ones((5, 10), dtype=np.float32)
@@ -606,11 +547,11 @@ def test_get_column_names():
     assert _get_column_names(X_df) == expected_columns
 
 
-def test_get_example_input_pandas_not_installed(pandas_not_installed):
+def test_get_example_input_from_tabular_data_pandas_not_installed(pandas_not_installed):
     # use pandas_not_installed fixture from conftest.py to pretend that pandas
     # is not installed and check that the function does not raise when pandas
     # import fails
-    _get_example_input(np.ones((5, 10)))
+    _get_example_input_from_tabular_data(np.ones((5, 10)))
 
 
 def test_get_column_names_pandas_not_installed(pandas_not_installed):

From ecbdf385bd241ab076f6080b6261e95bd9176a3d Mon Sep 17 00:00:00 2001
From: anferico <f.cariaggi4@gmail.com>
Date: Sun, 8 Jan 2023 19:17:58 +0100
Subject: [PATCH 10/11] Remove support for generators in text tasks + small
 fixes

---
 docs/changes.rst                     |  4 ++--
 skops/conftest.py                    |  4 +++-
 skops/hub_utils/_hf_hub.py           | 23 ++++++++++------------
 skops/hub_utils/tests/test_hf_hub.py | 29 ++++++++++------------------
 4 files changed, 25 insertions(+), 35 deletions(-)

diff --git a/docs/changes.rst b/docs/changes.rst
index 60b26541..a66a68c0 100644
--- a/docs/changes.rst
+++ b/docs/changes.rst
@@ -26,11 +26,11 @@ v0.4
   section/New section": "content"})`` to add "content" a new subsection called
   "New section" to an existing section called "Existing section". :pr:`203` by
   `Benjamin Bossan`_.
+- Support more array-like data types for tabular data and list-like data types
+  for text data. :pr:`179` by `Francesco Cariaggi`_.
 
 v0.3
 ----
-- Support more array-like data types for tabular data and list-like data types
-  for text data. :pr:`179` by `Francesco Cariaggi`_.
 - Utility function to add arbitrary files to be uploaded to the hub by using
   :func:`.hub_utils.add_files`. :pr:`123` by `Benjamin Bossan`_.
 - Add ``private`` as an optional argument to :meth:`skops.hub_utils.push` to
diff --git a/skops/conftest.py b/skops/conftest.py
index a09c69b8..4dcaed83 100644
--- a/skops/conftest.py
+++ b/skops/conftest.py
@@ -7,10 +7,12 @@
 def pandas_not_installed():
     # patch import so that it raises an ImportError when trying to import
     # pandas. This works because pandas is only imported lazily.
+    orig_import = __import__  # real importer, captured before it is patched
+
     def mock_import(name, *args, **kwargs):
         if name == "pandas":
             raise ImportError
-        return __import__(name, *args, **kwargs)
+        return orig_import(name, *args, **kwargs)
 
     with patch("builtins.__import__", side_effect=mock_import):
         yield
diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py
index 3b3135de..76226912 100644
--- a/skops/hub_utils/_hf_hub.py
+++ b/skops/hub_utils/_hf_hub.py
@@ -10,14 +10,12 @@
 import os
 import shutil
 from pathlib import Path
-from typing import Any, List, MutableMapping, Optional, Sequence, Union
+from typing import Any, List, Literal, MutableMapping, Optional, Sequence, Union
 
 import numpy as np
 from huggingface_hub import HfApi, InferenceApi, snapshot_download
 from sklearn.utils import check_array
 
-from ..utils.fixes import Literal
-
 SUPPORTED_TASKS = [
     "tabular-classification",
     "tabular-regression",
@@ -127,7 +125,7 @@ def _get_example_input_from_text_data(data: Sequence[str]):
     Parameters
     ----------
     data: Sequence[str]
-        An sequence of strings. The first 3 elements are used as example input.
+        A sequence of strings. The first 3 elements are used as example input.
 
     Returns
     -------
@@ -136,17 +134,16 @@ def _get_example_input_from_text_data(data: Sequence[str]):
     """
 
     def _head(data, n):
-        data, data_copy = itertools.tee(data, 2)
-        return list(itertools.islice(data_copy, n))
+        def is_subscriptable(data):
+            return hasattr(data, "__getitem__")
+
+        if is_subscriptable(data):
+            return data[:n]  # e.g. list, tuple, numpy.ndarray
+
+        return list(itertools.islice(data, n))  # not sliceable, e.g. a set
 
     def _is_sequence_of_strings(data):
-        if isinstance(data, str):
-            return False
-        try:
-            return all(isinstance(x, str) for x in data)
-        except TypeError:
-            # data isn't even iterable, can't be a sequence of strings
-            return False
+        return not isinstance(data, str) and all(isinstance(x, str) for x in data)
 
     error_message = "The data needs to be a sequence of strings."
     try:
diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py
index 7687e5e6..1abbad2b 100644
--- a/skops/hub_utils/tests/test_hf_hub.py
+++ b/skops/hub_utils/tests/test_hf_hub.py
@@ -231,7 +231,7 @@ def test_create_config(data, task, expected_config):
 
 
 def test_create_config_invalid_text_data(temp_path):
-    with pytest.raises(ValueError, match="The data needs to be a list of strings."):
+    with pytest.raises(ValueError, match="The data needs to be a sequence of strings."):
         _create_config(
             model_path="model.pkl",
             requirements=['scikit-learn="1.1.1"', "numpy"],
@@ -553,34 +553,25 @@ def test_get_example_input_from_tabular_data():
 
 
 def test_get_example_input_from_text_data():
-    examples = _get_example_input_from_text_data(["a", "b", "c", "d"])
-    assert len(examples) == 3
+    example_input = _get_example_input_from_text_data(["a", "b", "c", "d"])
+    assert len(example_input["data"]) == 3
 
-    examples = _get_example_input_from_text_data(np.array(["a", "b", "c", "d"]))
-    assert len(examples) == 3
+    example_input = _get_example_input_from_text_data(np.array(["a", "b", "c", "d"]))
+    assert len(example_input["data"]) == 3
 
-    examples = _get_example_input_from_text_data((c for c in ["a", "b", "c", "d"]))
-    assert len(examples) == 3
+    example_input = _get_example_input_from_text_data({"a", "b", "c", "d"})
+    assert len(example_input["data"]) == 3
 
-    examples = _get_example_input_from_text_data([])
-    assert len(examples) == 0
+    example_input = _get_example_input_from_text_data([])
+    assert len(example_input["data"]) == 0
 
 
 @pytest.mark.parametrize("data", ["random", [1, 2, 3], 420])
 def test_get_example_input_from_text_data_invalid_text_data(data):
-    with pytest.raises(
-        ValueError, match="The data needs to be an iterable of strings."
-    ):
+    with pytest.raises(ValueError, match="The data needs to be a sequence of strings."):
         _get_example_input_from_text_data(data)
 
 
-def test_get_example_input_from_text_data_generator_not_exhausted():
-    generator = (f"s{x}" for x in range(3))
-    _get_example_input_from_text_data(generator)
-    # check that next() doesn't raise a StopIteration
-    next(generator)
-
-
 def test_get_column_names():
     with pytest.raises(
         ValueError,

From 6b41b6935d7cd70f1e3198e7232724939c6ee6a7 Mon Sep 17 00:00:00 2001
From: anferico <f.cariaggi4@gmail.com>
Date: Thu, 19 Jan 2023 18:37:25 +0100
Subject: [PATCH 11/11] Add new section to changelog (v0.5)

---
 docs/changes.rst | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/docs/changes.rst b/docs/changes.rst
index a66a68c0..4e21b8ea 100644
--- a/docs/changes.rst
+++ b/docs/changes.rst
@@ -9,6 +9,11 @@ skops Changelog
     :depth: 1
     :local:
 
+v0.5
+----
+- Support more array-like data types for tabular data and list-like data types
+  for text data. :pr:`179` by `Francesco Cariaggi`_.
+
 v0.4
 ----
 - :func:`.io.dump` and :func:`.io.load` now work with file like objects,
@@ -26,8 +31,6 @@ v0.4
   section/New section": "content"})`` to add "content" a new subsection called
   "New section" to an existing section called "Existing section". :pr:`203` by
   `Benjamin Bossan`_.
-- Support more array-like data types for tabular data and list-like data types
-  for text data. :pr:`179` by `Francesco Cariaggi`_.
 
 v0.3
 ----