diff --git a/examples/plot_intelex.py b/examples/plot_intelex.py index b451d1f4..aac6863c 100644 --- a/examples/plot_intelex.py +++ b/examples/plot_intelex.py @@ -150,7 +150,7 @@ with NamedTemporaryFile(mode="bw", prefix="stock-", suffix=".pkl") as fp: pickle.dump(clf, file=fp) - + fp.seek(0) stock_repo = mkdtemp(prefix="stock-") hub_utils.init( model=fp.name, @@ -163,7 +163,7 @@ with NamedTemporaryFile(mode="bw", prefix="opt-", suffix=".pkl") as fp: pickle.dump(clf_opt, file=fp) - + fp.seek(0) opt_repo = mkdtemp(prefix="opt-") hub_utils.init( model=fp.name, diff --git a/examples/plot_tabular_regression.py b/examples/plot_tabular_regression.py index 5e9d8fb0..b7bdd6b3 100644 --- a/examples/plot_tabular_regression.py +++ b/examples/plot_tabular_regression.py @@ -63,7 +63,7 @@ # Initialize a repository to save our files in # ============================================ # We will now initialize a repository and save our model -_, pkl_name = mkstemp(prefix="skops-", suffix=".pkl") +_, pkl_name = mkstemp(prefix="skops-", suffix=".skops") with open(pkl_name, mode="bw") as f: sio.dump(model, file=f) diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py index efbd5c25..9f120471 100644 --- a/skops/hub_utils/_hf_hub.py +++ b/skops/hub_utils/_hf_hub.py @@ -10,12 +10,17 @@ import os import shutil from pathlib import Path +from pickle import dump as pickle_dump from typing import Any, List, Literal, MutableMapping, Optional, Sequence, Union import numpy as np from huggingface_hub import HfApi, InferenceApi, snapshot_download +from sklearn.base import BaseEstimator from sklearn.utils import check_array +from skops import card, io +from skops.card._model_card import _load_model + SUPPORTED_TASKS = [ "tabular-classification", "tabular-regression", @@ -405,8 +410,6 @@ def init( f"Task {task} not supported. 
Supported tasks are: {SUPPORTED_TASKS}" ) - model = _check_model_file(model) - dst.mkdir(parents=True, exist_ok=True) # add intelex requirement, if it's used and not already in requirements @@ -415,19 +418,64 @@ def init( ): requirements.append("scikit-learn-intelex") + # model parameter can be either a path or a model object try: - shutil.copy2(src=model, dst=dst) - - model_name = model.name - _create_config( - model_path=model_name, - requirements=requirements, - dst=dst, - task=task, - data=data, - model_format=model_format, - use_intelex=use_intelex, - ) + if isinstance(model, (str, Path)): + model = _check_model_file(model) + shutil.copy2(src=model, dst=dst) + + model_name = model.name + + _create_config( + model_path=model_name, + requirements=requirements, + dst=dst, + task=task, + data=data, + model_format=model_format, + use_intelex=use_intelex, + ) + + # load model from file + model = _load_model(model, trusted=True) + model_card = card.Card(model, metadata=card.metadata_from_config(dst)) + model_card.save(dst / "README.md") + elif isinstance(model, BaseEstimator): + # if it is a model object and its format is set to auto, choose skops by default + if model_format == "auto": + model_format = "skops" + elif model_format in ["pkl", "pickle", "joblib"]: + model_format = "pickle" + model_name = Path(dst / f"model.{model_format}") + + _create_config( + model_path=model_name, + requirements=requirements, + dst=dst, + task=task, + data=data, + model_format=model_format, + use_intelex=use_intelex, + ) + + # create model file if it doesn't exist to make a valid repository + if not os.path.isfile(model_name): + if model_format == "pickle": + with open(model_name, "wb") as f: + pickle_dump(model, f) + elif model_format == "skops": + io.dump(model, model_name) + + # create README if it doesn't exist + if not os.path.isfile(dst / "README.md"): + model_card = card.Card(model, metadata=card.metadata_from_config(dst)) + model_card.save(dst / "README.md") + else: + raise 
ValueError( + "Cannot determine the input model argument. " + "Please indicate a model with the expected type." + ) + except Exception: shutil.rmtree(dst) raise diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py index d2ee3931..d8e27c79 100644 --- a/skops/hub_utils/tests/test_hf_hub.py +++ b/skops/hub_utils/tests/test_hf_hub.py @@ -15,6 +15,7 @@ import sklearn from flaky import flaky from huggingface_hub import HfApi +from huggingface_hub.repocard import RepoCard from huggingface_hub.utils import RepositoryNotFoundError from sklearn.datasets import load_diabetes, load_iris from sklearn.linear_model import LinearRegression, LogisticRegression @@ -85,11 +86,13 @@ def classifier(repo_path, config_json): path = repo_path / model_file try: - if file_format == "pickle": + if file_format == "pickle" or file_format == "joblib": with open(path, "wb") as f: pickle.dump(clf, f) elif file_format == "skops": dump(clf, path) + elif file_format == "auto": + dump(clf, path) yield path finally: path.unlink(missing_ok=True) @@ -108,10 +111,22 @@ def classifier(repo_path, config_json): "model": {"file": "model.skops"}, } }, + "auto": { + "sklearn": { + "environment": ['scikit-learn="1.1.1"'], + "model": {"file": "model.skops"}, + } + }, + "joblib": { + "sklearn": { + "environment": ['scikit-learn="1.1.1"'], + "model": {"file": "model.joblib"}, + } + }, } -@pytest.fixture(scope="session", params=["skops", "pickle"]) +@pytest.fixture(scope="session", params=["skops", "pickle", "auto", "joblib"]) def config_json(repo_path, request): path = repo_path / "config.json" try: @@ -292,6 +307,8 @@ def test_init(classifier, config_json): ) _validate_folder(path=dir_path) + assert os.path.isfile(Path(dir_path) / "README.md") + # it should fail a second time since the folder is no longer empty. 
with pytest.raises(OSError, match="None-empty dst path already exists!"): init( @@ -303,6 +320,56 @@ def test_init(classifier, config_json): ) +@pytest.fixture( + params=[pytest.param("classifier", marks=pytest.mark.usefixtures), get_classifier()] +) +def classifiers(request): + # Returns a model object or a path to a model with all + # model format combinations from CONFIG dict + try: + yield request.getfixturevalue(request.param) + except Exception: # get_classifier() is not a fixture, exception raised + yield request.param + + +def test_override_init_modelcard(classifiers, config_json): + # create a temp directory and delete it, we just need a unique name. + dir_path = tempfile.mkdtemp() + shutil.rmtree(dir_path) + + version = metadata.version("scikit-learn") + _, model_format = config_json + # joblib type falls under auto format, explicitly set to auto + # because we can't repeat key "auto" in CONFIG dict + if model_format == "joblib": + model_format = "auto" + + init( + model=classifiers, + requirements=[f'scikit-learn="{version}"'], + dst=dir_path, + task="tabular-classification", + data=iris.data, + model_format=model_format, + ) + _validate_folder(path=dir_path) + + # initial card does not have a license set + with pytest.raises( + AttributeError, match="'CardData' object has no attribute 'license'" + ): + model_card = RepoCard.load(Path(dir_path) / "README.md") + model_card.data.license + + # override existing model card created by init with license attribute + model = get_classifier() + model_card = card.Card(model, metadata=card.metadata_from_config(Path(dir_path))) + model_card.metadata.license = "mit" + model_card.save(Path(dir_path) / "README.md") + new_card = RepoCard.load(Path(dir_path) / "README.md") + assert new_card.data.license == "mit" + + def test_init_no_warning_or_error(classifier, config_json): config_path, file_format = config_json # for the happy path, there should be no warning