diff --git a/docs/changes.rst b/docs/changes.rst index cb63da52..c17e73c6 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -16,8 +16,8 @@ v0.9 estimators. :pr:`384` by :user:`Reid Johnson `. - Fix an issue with visualizing Skops files for `scikit-learn` tree estimators. :pr:`386` by :user:`Reid Johnson `. -- :func:`skops.hug_utils.get_model_output` is deprecated and will be removed in version - 0.10. :pr:`396` by `Adrin Jalali`_. +- :func:`skops.hub_utils.get_model_output` and :func:`skops.hub_utils.push` are + deprecated and will be removed in version 0.10. :pr:`396` by `Adrin Jalali`_. v0.8 ---- diff --git a/docs/index.rst b/docs/index.rst index e69e4f5a..76c325f9 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -15,10 +15,8 @@ find the source code and the development discussions on `Github The following examples are good starting points: -- How to create and initialize a scikit-learn model repo: - :ref:`sphx_glr_auto_examples_plot_hf_hub.py`. You can see all the models - uploaded to the Hugging Face Hub using this library `here - `_. 
+- Improve your data science workflow with skops: + :ref:`sphx_glr_auto_examples_plot_california_housing.py` - How to create a model card for your scikit-learn based model: :ref:`sphx_glr_auto_examples_plot_model_card.py` - A text classification example, and its integration with the hub: diff --git a/examples/plot_california_housing.py b/examples/plot_california_housing.py index 3b8c7aa8..e20c067a 100644 --- a/examples/plot_california_housing.py +++ b/examples/plot_california_housing.py @@ -1,6 +1,6 @@ """ -Improve your data science workflow with skops and Hugging Face -============================================================== +Improve your data science workflow with skops +============================================= """ # %% @@ -48,7 +48,6 @@ import numpy as np import pandas as pd import sklearn -from huggingface_hub import HfApi from matplotlib.patches import Rectangle from sklearn.compose import ColumnTransformer from sklearn.datasets import fetch_california_housing @@ -1860,57 +1859,11 @@ os.listdir(hub_dir) # %% -# Perfect. In order to create a repository on the Hub, we need a token. -# When you’re logged into the Hub, you can find it here: -# https://huggingface.co/settings/tokens. For this exercise, we assume the -# token is set as an environment variable called ``HF_HUB_TOKEN``. -# We could also set it here directly by pasting it as a string, but -# generally we should keep the token secret. - - -# %% -token = os.environ["HF_HUB_TOKEN"] - -# %% -# Now it’s time to push the repository to the Hub. 
We think of a good -# name, which, together with the user name, will constitute the -# ``repo_id``: - -# %% -repo_name = "example-california-housing" -user_name = HfApi().whoami(token=token)["name"] -repo_id = f"{user_name}/{repo_name}" -print(f"Creating and pushing to repo: {repo_id}") - -# %% [markdown] -# Finally, we call `hub_utils.push` like this: - -# %% -hub_utils.push( - repo_id=repo_id, - source=hub_dir, - token=token, - create_remote=True, - private=False, -) - -# %% -# We might consider changing the ``private`` argument here, -# depending on our goal. However, we can always change it later on the -# repository settings if we want to. - -# %% -# Now let’s print the full URL and visit it: - -# %% -print(f"Visit the following URL: https://huggingface.co/{repo_id}") - -# %% -# We can now visit the page and see the rendered model card, we can use -# the inference widget to try out what the model would predict for a given -# input (the warmup for the widget may take a while), other people can -# comment on the repo and make PRs, etc. So let’s share the link with our -# interested friends and colleagues! +# Creating the Repo and Pushing to Hugging Face Hub +# ------------------------------------------------- +# You can use the tools available in ``huggingface_hub`` to create a repo and +# push the contents of the repo folder to that repo. For more information visit +# https://huggingface.co/docs/huggingface_hub/index # %% # Conclusion diff --git a/examples/plot_hf_hub.py b/examples/plot_hf_hub.py deleted file mode 100644 index a10e6b3a..00000000 --- a/examples/plot_hf_hub.py +++ /dev/null @@ -1,172 +0,0 @@ -""" -scikit-learn models on Hugging Face Hub ---------------------------------------- - -This guide demonstrates how you can use this package to create a Hugging Face -Hub model repository based on a scikit-learn compatible model, and how to -fetch scikit-learn compatible models from the Hub and run them locally. 
-""" - -# %% -# Imports -# ======= -# First we will import everything required for the rest of this document. - -import json -import os -import pickle -from pathlib import Path -from tempfile import mkdtemp, mkstemp -from uuid import uuid4 - -import sklearn -from huggingface_hub import HfApi -from sklearn.datasets import load_breast_cancer -from sklearn.ensemble import HistGradientBoostingClassifier -from sklearn.experimental import enable_halving_search_cv # noqa -from sklearn.model_selection import HalvingGridSearchCV, train_test_split - -from skops import card, hub_utils - -# %% -# Data -# ==== -# Then we create some random data to train and evaluate our model. - -X, y = load_breast_cancer(as_frame=True, return_X_y=True) -X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.3, random_state=42 -) -print("X's summary: ", X.describe()) -print("y's summary: ", y.describe()) - - -# %% -# Train a Model -# ============= -# Using the above data, we train a model. To select the model, we use -# :class:`~sklearn.model_selection.HalvingGridSearchCV` with a parameter grid -# over :class:`~sklearn.ensemble.HistGradientBoostingClassifier`. - -param_grid = { - "max_leaf_nodes": [5, 10, 15], - "max_depth": [2, 5, 10], -} - -model = HalvingGridSearchCV( - estimator=HistGradientBoostingClassifier(), - param_grid=param_grid, - random_state=42, - n_jobs=-1, -).fit(X_train, y_train) -model.score(X_test, y_test) - -# %% -# Initialize a Model Repo -# ======================= -# We now initialize a model repository locally, and push it to the hub. For -# that, we need to first store the model as a pickle file and pass it to the -# hub tools. - -# The file name is not significant, here we choose to save it with a `pkl` -# extension. 
-_, pkl_name = mkstemp(prefix="skops-", suffix=".pkl") -with open(pkl_name, mode="bw") as f: - pickle.dump(model, file=f) - -local_repo = mkdtemp(prefix="skops-") -hub_utils.init( - model=pkl_name, - requirements=[f"scikit-learn={sklearn.__version__}"], - dst=local_repo, - task="tabular-classification", - data=X_test, -) -if "__file__" in locals(): # __file__ not defined during docs built - # Add this script itself to the files to be uploaded for reproducibility - hub_utils.add_files(__file__, dst=local_repo) - -# %% -# We can no see what the contents of the created local repo are: -print(os.listdir(local_repo)) - -# %% -# Model Card -# ========== -# We will now create a model card and save it. For more information about how -# to create a good model card, refer to the :ref:`model card example -# `. The following code uses -# :func:`~skops.card.metadata_from_config` which creates a minimal metadata -# object to be included in the metadata section of the model card. The -# configuration used by this method is stored in the ``config.json`` file which -# is created by the call to :func:`~skops.hub_utils.init`. -model_card = card.Card(model, metadata=card.metadata_from_config(Path(local_repo))) -model_card.save(Path(local_repo) / "README.md") - -# %% -# Push to Hub -# =========== -# And finally, we can push the model to the hub. This requires a user access -# token which you can get under https://huggingface.co/settings/tokens - -# you can put your own token here, or set it as an environment variable before -# running this script. -token = os.environ["HF_HUB_TOKEN"] - -repo_name = f"hf_hub_example-{uuid4()}" -user_name = HfApi().whoami(token=token)["name"] -repo_id = f"{user_name}/{repo_name}" -print(f"Creating and pushing to repo: {repo_id}") - -# %% -# Now we can push our files to the repo. The following function creates the -# remote repository if it doesn't exist; this is controlled via the -# ``create_remote`` argument. 
Note that here we're setting ``private=True``, -# which means only people with the right permissions would see the model. Set -# ``private=False`` to make it visible to the public. - -hub_utils.push( - repo_id=repo_id, - source=local_repo, - token=token, - commit_message="pushing files to the repo from the example!", - create_remote=True, - private=True, -) - -# %% -# Once uploaded, other users can download and use it, unless you make the repo -# private. Given a repository's name, here's how one can download it: -repo_copy = mkdtemp(prefix="skops") -hub_utils.download(repo_id=repo_id, dst=repo_copy, token=token) -print(os.listdir(repo_copy)) - - -# %% -# You can also get the requirements of this repository: -print(hub_utils.get_requirements(path=repo_copy)) - -# %% -# As well as the complete configuration of the project: -print(json.dumps(hub_utils.get_config(path=repo_copy), indent=2)) - -# %% -# Now you can check the contents of the repository under your user. -# -# Update Requirements -# =================== -# If you update your environment and the versions of your requirements are -# changed, you can update the requirement in your repo by calling -# ``update_env``, which automatically detects the existing installation of the -# current environment and updates the requirements accordingly. - -hub_utils.update_env(path=local_repo, requirements=["scikit-learn"]) - -# %% -# Delete Repository -# ================= -# At the end, you can also delete the repository you created using -# ``HfApi().delete_repo``. For more information please refer to the -# documentation of ``huggingface_hub`` library. 
- -HfApi().delete_repo(repo_id=repo_id, token=token) diff --git a/examples/plot_intelex.py b/examples/plot_intelex.py deleted file mode 100644 index b451d1f4..00000000 --- a/examples/plot_intelex.py +++ /dev/null @@ -1,250 +0,0 @@ -""" -Creating models that are accelerated by Intel(R) Extension for scikit-learn ---------------------------------------------------------------------------- - -Introduction -============ - -This guide demonstrates how under certain conditions, Intel(R) Extension for -Scikit-learn (also ``scikit-learn-intelex``, or ``sklearnex``) can be used to -speed up inference of Scikit-learn models. - -The extension supports most of Scikit-learn's classical machine learning -algorithms, like k-nearest neighbors, support vector machines, linear/logistic -regression, and more. Stock Scikit-learn implementations are used where no -optimized version is available, making this package 100% compatible with -existing code. Note while compatibility is assured by continuous testing, -equivalence of results between the two packages is not guaranteed. In fact, due -to independent implementations, intermediate results differ in many cases. An -up-to-date list of supported algorithms can be found in the `official -documentation `_. - -Intel(R) Extension for Scikit-learn accelerates Scikit-learn algorithms by using -the latest hardware features and optimized caching and threading strategies. -Find more details in Intel's blog posts on Medium (`1 -`_, -`2 -`_). -In many cases, optimizations translate to hardware from other vendors, albeit -with smaller performance gains. - -For this example, we train two simple -:class:`sklearn.neighbors.KNeighborsClassifier` instances, one with and one -without using ``sklearnex``, and compare inference times. Afterward, we upload -both models to the Hugging Face Model Hub. Hugging Face Hub supports -``sklearnex``-optimized models, meaning the achieved speedup will translate for -Inference API users. 
-""" - -# %% -# Imports -# ======= -# First, we import everything required for the rest of this document. - -import os -import pickle -from pathlib import Path -from tempfile import NamedTemporaryFile, mkdtemp -from time import perf_counter -from uuid import uuid4 - -from huggingface_hub import delete_repo, whoami -from sklearn.datasets import make_classification -from sklearn.metrics import log_loss -from sklearn.model_selection import train_test_split -from sklearn.neighbors import KNeighborsClassifier -from sklearnex.neighbors import KNeighborsClassifier as KNeighborsClassifierOptimized - -from skops import card, hub_utils - -# %% -# Data -# ==== -# Next, we create some generic data. A dataset of 50k rows x 15 columns is big enough -# to showcase a performance gain from using ``sklearnex``. Generally speaking, -# larger datasets will benefit more from the ``sklearnex`` optimizations. More -# details can be found in the official -# `README `_. -X, y = make_classification( - n_samples=50_000, - n_features=15, - n_informative=15, - n_redundant=0, - n_clusters_per_class=1, - shuffle=False, - random_state=42, -) -X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.3, random_state=42 -) - -# %% -# Training the stock model -# ======================== -# Now we can train a stock Scikit-learn -# :class:`sklearn.neighbors.KNeighborsClassifier` - -clf = KNeighborsClassifier(3, n_jobs=-1) -start = perf_counter() -clf.fit(X_train, y_train) -print(f"Training finished in {perf_counter() - start:.2f}s") - -# %% -# Training the optimized model -# ============================ -# Now we fit the optimized algorithm. Note, that rather than loading the model -# from ``sklearnex``, we could also load and call ``patch_sklearn()``. Find more -# details in the `documentation -# `_. 
- -clf_opt = KNeighborsClassifierOptimized(3, n_jobs=-1) -start = perf_counter() -clf_opt.fit(X_train, y_train) -print(f"Training finished in {perf_counter() - start:.2f}s") - -# %% -# We are not comparing the k-NN fit times, since this is not a compute-intensive -# task and both are typically very fast. - -# %% -# Comparing inference times -# ========================= -# Now to the interesting part: We measure the execution time of -# ``predict_proba()`` for the two models. - -start = perf_counter() -y_proba = clf.predict_proba(X_test) -t_stock = perf_counter() - start - -log_loss_score = log_loss(y_test, y_proba) -print( - f"[stock scikit-learn] Inference took t_stock = {t_stock:.2f}s with a " - f"log-loss score of {log_loss_score:.3f}" -) - -start = perf_counter() -y_proba = clf_opt.predict_proba(X_test) -t_opt = perf_counter() - start - -log_loss_score = log_loss(y_test, y_proba) -print( - f"[sklearnex] Inference took t_opt = {t_opt:.2f}s with a log-loss score of" - f" {log_loss_score:.3f}" -) - -print(f"t_stock / t_opt = {t_stock/t_opt:.1f}") - -# %% -# We see that inference using ``sklearnex`` is a lot faster while achieving the -# same log-loss score. - -# %% -# Save and upload the models -# ========================== -# Let's save all required files to disk and initialize Hugging Face Model Hub -# repositories. 
- -# replace with your own token or set it as an environment variable before -# running the script -token = os.environ["HF_HUB_TOKEN"] - -with NamedTemporaryFile(mode="bw", prefix="stock-", suffix=".pkl") as fp: - pickle.dump(clf, file=fp) - - stock_repo = mkdtemp(prefix="stock-") - hub_utils.init( - model=fp.name, - requirements=["scikit-learn=1.2.1"], - dst=stock_repo, - task="tabular-classification", - data=X_test, - ) - - -with NamedTemporaryFile(mode="bw", prefix="opt-", suffix=".pkl") as fp: - pickle.dump(clf_opt, file=fp) - - opt_repo = mkdtemp(prefix="opt-") - hub_utils.init( - model=fp.name, - requirements=["scikit-learn=1.2.1", "scikit-learn-intelex=2023.0.1"], - dst=opt_repo, - task="tabular-classification", - data=X_test, - use_intelex=True, - ) - -# Create Model cards with the most basic information -clf_card = card.Card(clf, metadata=card.metadata_from_config(Path(stock_repo))) -clf_card.metadata.license = "mit" -limitations = "This model is not ready to be used in production." -model_description = ( - "This is a `KNeighborsClassifier` model trained on synthetic data. It is " - "trained with the stock scikit-learn algorithm and part of a " - "demonstration, showing how Intel(R) Extension for scikit-learn can be " - "used to speed up model inference times." -) -model_card_authors = "skops_user" -citation_bibtex = "**BibTeX**\n\n```\n@inproceedings{...,year={2020}}\n```" -clf_card.add( - **{ - "Citation": citation_bibtex, - "Model Card Authors": model_card_authors, - "Model description": model_description, - "Model description/Intended uses & limitations": limitations, - } -) -clf_card.save(Path(stock_repo) / "README.md") - -clf_opt_card = card.Card(clf_opt, metadata=card.metadata_from_config(Path(opt_repo))) -model_description = ( - "This is a `KNeighborsClassifier` model trained on synthetic data. 
It is " - "trained with the Intel(R) extension for scikit-learn optimized version of " - "the algorithm, and part of a demonstration, showing how Intel(R) " - "Extension for scikit-learn can be used to speed up model inference times." -) -clf_card.add( - **{ - "Citation": citation_bibtex, - "Model Card Authors": model_card_authors, - "Model description": model_description, - "Model description/Intended uses & limitations": limitations, - } -) -clf_opt_card.save(Path(opt_repo) / "README.md") - -# Push everything to the Model hub -user_name = whoami(token=token)["name"] -uuid = uuid4() -repo_id_stock = f"{user_name}/knn-example-stock-{uuid}" -repo_id_opt = f"{user_name}/knn-example-intelex-{uuid}" - -print(f"Pushing skl model to: {repo_id_stock}") -hub_utils.push( - repo_id=repo_id_stock, - source=stock_repo, - token=token, - commit_message="Add scikit-learn KNN model example", - create_remote=True, - private=False, -) -print(f"Pushing sklearnex model to: {repo_id_opt}") -hub_utils.push( - repo_id=repo_id_opt, - source=opt_repo, - token=token, - commit_message="Add scikit-learn-intelex KNN model example", - create_remote=True, - private=False, -) - - -# %% -# Delete Repository -# ================= -# At the end, we can delete the created repositories again using -# ``delete_repo``. For more information please refer to the -# documentation of ``huggingface_hub`` library. - -delete_repo(repo_id=repo_id_stock, token=token) -delete_repo(repo_id=repo_id_opt, token=token) diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py index fb161388..660ed6b6 100644 --- a/skops/hub_utils/_hf_hub.py +++ b/skops/hub_utils/_hf_hub.py @@ -574,6 +574,14 @@ def push( This function raises a ``TypeError`` if the contents of the source folder do not make a valid Hugging Face Hub scikit-learn based repo. """ + warnings.warn( + "Creating repos on hf.co is subject to strict rate limits now and therefore" + " this feature is to be removed from this library in version 0.10. 
You can" + " use tools directly available in the huggingface_hub library instead to" + " create and push files.", + FutureWarning, + stacklevel=2, # attribute the warning to the caller of push() + ) _validate_folder(path=source) client = HfApi() diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py index 59b51ebb..e4f8ddbb 100644 --- a/skops/hub_utils/tests/test_hf_hub.py +++ b/skops/hub_utils/tests/test_hf_hub.py @@ -15,14 +15,12 @@ import sklearn from flaky import flaky from huggingface_hub import HfApi -from huggingface_hub.utils import RepositoryNotFoundError from sklearn.datasets import load_diabetes, load_iris from sklearn.linear_model import LinearRegression, LogisticRegression from skops import card from skops.hub_utils import ( add_files, - download, get_config, get_model_output, get_requirements, @@ -364,66 +362,10 @@ def test_init_empty_model_file_errors(repo_path, config_json): model_path.unlink(missing_ok=True) -@pytest.mark.network -@flaky(max_runs=3) -@pytest.mark.parametrize("explicit_create", [True, False]) -def test_push_download( - explicit_create, - repo_path, - destination_path, - classifier, - config_json, -): - config_path, file_format = config_json - client = HfApi() - - version = metadata.version("scikit-learn") - init( - model=classifier, - requirements=[f'scikit-learn="{version}"'], - dst=destination_path, - task="tabular-classification", - data=iris.data, - ) - - user = client.whoami(token=HF_HUB_TOKEN)["name"] - repo_id = f"{user}/test-{uuid4()}" - if explicit_create: - client.create_repo(repo_id=repo_id, token=HF_HUB_TOKEN, repo_type="model") - push( - repo_id=repo_id, - source=repo_path, - token=HF_HUB_TOKEN, - commit_message="test message", - create_remote=True, - private=True, - ) - - # TODO: remove 1st message when huggingface_hub < v0.12 is dropped - # message changes in huggingface_hub v0.12, test both - match = ( - "If the repo is private, make sure you are authenticated" - "|" - "If you are trying to access a private or gated repo, " - "make sure you are 
authenticated" - ) - with pytest.raises(RepositoryNotFoundError, match=match): - download(repo_id=repo_id, dst="/tmp/test") - - with pytest.raises(OSError, match="None-empty dst path already exists!"): - download(repo_id=repo_id, dst=destination_path, token=HF_HUB_TOKEN) - - files = client.list_repo_files(repo_id=repo_id, use_auth_token=HF_HUB_TOKEN) - for f_name in [classifier.name, config_path.name]: - assert f_name in files - - try: - with tempfile.TemporaryDirectory(prefix="skops-test") as dst: - download(repo_id=repo_id, dst=dst, token=HF_HUB_TOKEN, keep_cache=False) - copy_files = os.listdir(dst) - assert set(copy_files) == set(files) - finally: - client.delete_repo(repo_id=repo_id, token=HF_HUB_TOKEN) +def test_push_deprecation(): + with pytest.warns(FutureWarning, match="Creating repos on hf.co is subject"): + with pytest.raises(Exception): + push(repo_id="dummy", source=".") @pytest.fixture