-
Notifications
You must be signed in to change notification settings - Fork 61
ENH add examples and dtypes to CardData and config.json #45
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
0bbaf42
65b191c
c7a742a
475bc43
c1f3a10
d85a7f4
01c507b
4b6b9c7
a9bc358
95f47a2
a5c02f6
b20b939
080af4e
0694caf
a35f8a8
240211e
1d63e35
cee60eb
85713c4
b056cb8
2c174c3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,11 +8,21 @@ | |
| import json | ||
| import shutil | ||
| from pathlib import Path | ||
| from typing import Any, MutableMapping, Union | ||
| from typing import Any, List, MutableMapping, Union | ||
|
|
||
| import numpy as np | ||
| from huggingface_hub import HfApi, snapshot_download | ||
| from requests import HTTPError | ||
|
|
||
| from ..utils.fixes import Literal | ||
|
|
||
| SUPPORTED_TASKS = [ | ||
| "tabular-classification", | ||
| "tabular-regression", | ||
| "text-classification", | ||
| "text-regression", | ||
|
adrinjalali marked this conversation as resolved.
|
||
| ] | ||
|
|
||
|
|
||
| def _validate_folder(path: Union[str, Path]) -> None: | ||
| """Validate the contents of a folder. | ||
|
|
@@ -57,8 +67,85 @@ def _validate_folder(path: Union[str, Path]) -> None: | |
| raise TypeError(f"Model file {model_path} does not exist.") | ||
|
|
||
|
|
||
| def _get_example_input(data): | ||
| """Returns the example input of a model. | ||
|
|
||
| The input is converted into a dictionary which is then stored in the config | ||
| file. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| data: array-like | ||
| The input needs to be either a ``pandas.DataFrame`` or a | ||
| ``numpy.ndarray``. The first 3 rows are used as example input. | ||
|
adrinjalali marked this conversation as resolved.
|
||
|
|
||
| Returns | ||
| ------- | ||
| example_input: dict of lists | ||
| The example input of the model as accepted by HuggingFace's backend. | ||
| """ | ||
| try: | ||
| import pandas as pd | ||
|
|
||
| if isinstance(data, pd.DataFrame): | ||
| return {x: data[x][:3].to_list() for x in data.columns} | ||
| except ImportError: | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See #53 for discussion on pandas dependency. |
||
| # pandas is not installed, the data cannot be a pandas DataFrame | ||
| pass | ||
|
|
||
| # here we convert the first three rows of the numpy array to a dict of lists | ||
| # to be stored in the config file | ||
| if isinstance(data, np.ndarray): | ||
| return {f"x{x}": data[:3, x].tolist() for x in range(data.shape[1])} | ||
|
|
||
| raise ValueError("The data is not a pandas.DataFrame or a numpy.ndarray.") | ||
|
|
||
|
|
||
| def _get_column_names(data): | ||
| """Returns the column names of the input. | ||
|
|
||
| If data is a ``numpy.ndarray``, column names are assumed to be ``x0`` to | ||
| ``xn-1``, where ``n`` is the number of columns. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| data: pandas.DataFrame or numpy.ndarray | ||
| The data whose columns names are to be returned. | ||
|
|
||
| Returns | ||
| ------- | ||
| columns: list of tuples | ||
| A list of strings. Each string is a column name. | ||
| """ | ||
| try: | ||
| import pandas as pd | ||
|
|
||
| if isinstance(data, pd.DataFrame): | ||
| return list(data.columns) | ||
| except ImportError: | ||
| # pandas is not installed, the data cannot be a pandas DataFrame | ||
| pass | ||
|
|
||
| # TODO: this is going to fail for Structured Arrays. We can add support for | ||
| # them later if we see need for it. | ||
| if isinstance(data, np.ndarray): | ||
| return [f"x{x}" for x in range(data.shape[1])] | ||
|
|
||
| raise ValueError("The data is not a pandas.DataFrame or a numpy.ndarray.") | ||
|
|
||
|
|
||
| def _create_config( | ||
| *, model_path: Union[str, Path], requirements: list[str], dst: Union[str, Path] | ||
| *, | ||
| model_path: Union[str, Path], | ||
| requirements: List[str], | ||
| dst: Union[str, Path], | ||
| task: Literal[ | ||
| "tabular-classification", | ||
| "tabular-regression", | ||
| "text-classification", | ||
| "text-regression", | ||
| ], | ||
| data, | ||
| ) -> None: | ||
| """Write the configuration into a `config.json` file. | ||
|
|
||
|
|
@@ -74,6 +161,25 @@ def _create_config( | |
| dst : str, or Path | ||
| The path to an existing folder where the config file should be created. | ||
|
|
||
| task: "tabular-classification", "tabular-regression", | ||
| "text-classification", / | ||
| or "text-regression" | ||
| The task of the model, which determines the input and output type of | ||
| the model. It can be one of: ``tabular-classification``, | ||
| ``tabular-regression``, ``text-classification``, ``text-regression``. | ||
|
|
||
| data: array-like | ||
| The input to the model. This is used for two purposes: | ||
|
|
||
| 1. Save an example input to the model, which is used by | ||
| HuggingFace's backend and shown in the widget of the model's | ||
| page. | ||
| 2. Store the columns and their order of the input, which is used by | ||
| HuggingFace's backend to pass the data in the right form to the | ||
| model. | ||
|
|
||
| The first 3 input values are used as example inputs. | ||
|
|
||
| Returns | ||
| ------- | ||
| None | ||
|
|
@@ -87,13 +193,33 @@ def recursively_default_dict() -> MutableMapping: | |
| config = recursively_default_dict() | ||
| config["sklearn"]["model"]["file"] = str(model_path) | ||
| config["sklearn"]["environment"] = requirements | ||
| config["sklearn"]["task"] = task | ||
|
|
||
| if "tabular" in task: | ||
| config["sklearn"]["example_input"] = _get_example_input(data) | ||
| config["sklearn"]["columns"] = _get_column_names(data) | ||
| elif "text" in task: | ||
| if isinstance(data, list) and all(isinstance(x, str) for x in data): | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. According to [reference lost in extraction]. Say we don't require lists: the code below still assumes that data is a Sequence (can be indexed). If we don't want to make that assumption, we could go for [code suggestion lost in extraction].
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, I know we're being very restrictive with the types we accept here. We also don't accept all array-like things accepted by sklearn. Adding support for those is nice, and necessary, but I was hoping to have that in a future PR and have something basic here, which doesn't necessarily limit users. They can still pass a list to this method even if their data is originally in a [type name lost in extraction]. Opened #65 to track this. |
||
| config["sklearn"]["example_input"] = {"data": data[:3]} | ||
| else: | ||
| raise ValueError("The data needs to be a list of strings.") | ||
|
|
||
| with open(Path(dst) / "config.json", mode="w") as f: | ||
| json.dump(config, f, sort_keys=True, indent=4) | ||
|
|
||
|
|
||
| def init( | ||
| *, model: Union[str, Path], requirements: list[str], dst: Union[str, Path] | ||
| *, | ||
| model: Union[str, Path], | ||
| requirements: List[str], | ||
| dst: Union[str, Path], | ||
| task: Literal[ | ||
| "tabular-classification", | ||
| "tabular-regression", | ||
| "text-classification", | ||
| "text-regression", | ||
| ], | ||
| data, | ||
|
adrinjalali marked this conversation as resolved.
|
||
| ) -> None: | ||
| """Initialize a scikit-learn based HuggingFace repo. | ||
|
|
||
|
|
@@ -112,19 +238,52 @@ def init( | |
| dst: str, or Path | ||
| The path to a non-existing or empty folder which is to be initialized. | ||
|
|
||
| task: str | ||
| The task of the model, which determines the input and output type of | ||
| the model. It can be one of: ``tabular-classification``, | ||
| ``tabular-regression``, ``text-classification``, ``text-regression``. | ||
|
|
||
| data: array-like | ||
| The input to the model. This is used for two purposes: | ||
|
|
||
| 1. Save an example input to the model, which is used by | ||
| HuggingFace's backend and shown in the widget of the model's | ||
| page. | ||
| 2. Store the columns and their order of the input, which is used by | ||
| HuggingFace's backend to pass the data in the right form to the | ||
| model. | ||
|
|
||
| The first 3 input values are used as example inputs. | ||
|
|
||
| If ``task`` is ``tabular-classification`` or ``tabular-regression``, | ||
| the data needs to be a ``pandas.DataFrame`` or a ``numpy.ndarray``. If | ||
| ``task`` is ``text-classification`` or ``text-regression``, the data | ||
| needs to be a ``list`` of strings. | ||
|
|
||
| Returns | ||
| ------- | ||
| None | ||
| """ | ||
| dst = Path(dst) | ||
| if dst.exists() and bool(next(dst.iterdir(), None)): | ||
| raise OSError("None-empty dst path already exists!") | ||
|
|
||
| if task not in SUPPORTED_TASKS: | ||
| raise ValueError( | ||
| f"Task {task} not supported. Supported tasks are: {SUPPORTED_TASKS}" | ||
| ) | ||
| dst.mkdir(parents=True, exist_ok=True) | ||
|
|
||
| shutil.copy2(src=model, dst=dst) | ||
|
|
||
| model_name = Path(model).name | ||
| _create_config(model_path=model_name, requirements=requirements, dst=dst) | ||
| _create_config( | ||
| model_path=model_name, | ||
| requirements=requirements, | ||
| dst=dst, | ||
| task=task, | ||
| data=data, | ||
| ) | ||
|
|
||
|
|
||
| def update_env( | ||
|
|
@@ -222,7 +381,7 @@ def get_config(path: Union[str, Path]) -> dict[str, Any]: | |
| Parameters | ||
| ---------- | ||
| path: str | ||
| The path to the director holding the project and its ``config.json`` | ||
| The path to the directory holding the project and its ``config.json`` | ||
|
adrinjalali marked this conversation as resolved.
|
||
| configuration file. | ||
|
|
||
| Returns | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.