Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
f5e83a9
feat: generate README.md in hub_utils.init
jucamohedano Nov 1, 2022
28b4b0c
Merge branch 'skops-dev:main' into main
jucamohedano Nov 2, 2022
f0e9683
ref: replace _create_readme function with fewer lines
jucamohedano Nov 2, 2022
1aeb14c
test create model card in hub_utils.init
jucamohedano Nov 3, 2022
95c0e1b
test override model card after created by hub_utils.init
jucamohedano Nov 3, 2022
e0e6c7d
Merge branch 'skops-dev:main' into main
jucamohedano Nov 14, 2022
4b6cb73
ref: deduplicate test creation of README in init
jucamohedano Nov 14, 2022
870797f
fix: check that content of new model card is modified
jucamohedano Nov 14, 2022
d3a0eac
Merge branch 'skops-dev:main' into main
jucamohedano Nov 14, 2022
4b3fb8d
Merge branch 'main' of github.com:jucamohedano/skops into main
jucamohedano Nov 14, 2022
eaed93b
Merge branch 'skops-dev:main' into main
jucamohedano Nov 18, 2022
f182ee1
revert lines removed by mistake
jucamohedano Nov 18, 2022
9a41cf2
Merge branch 'main' into main
adrinjalali Nov 22, 2022
7f7d0c2
Merge branch 'skops-dev:main' into main
jucamohedano Nov 25, 2022
1c19795
Merge branch 'skops-dev:main' into main
jucamohedano Dec 4, 2022
56165e4
Merge branch 'skops-dev:main' into main
jucamohedano Dec 5, 2022
6f99565
Merge branch 'main' into main
jucamohedano Jan 19, 2023
c265f50
Merge branch 'main' into main
jucamohedano Jan 21, 2023
0b6d3e2
fix: check model format of model file
jucamohedano Jan 21, 2023
5e1494a
fix: run pre-commit on all files
jucamohedano Jan 23, 2023
5cfa962
Merge branch 'skops-dev:main' into main
jucamohedano Jan 23, 2023
35a30c2
Merge branch 'skops-dev:main' into main
jucamohedano Jan 31, 2023
0c4a66f
fix: check for file suffix to determine format
jucamohedano Jan 31, 2023
5436894
Merge branch 'skops-dev:main' into main
jucamohedano Feb 12, 2023
3c8e879
Merge branch 'skops-dev:main' into main
jucamohedano Feb 25, 2023
eec7a8d
fix: check for model_format and suffix if auto
jucamohedano Mar 1, 2023
1503850
fix: use model_format argument in test and create new parameter for auto
jucamohedano Mar 1, 2023
0249d41
fix: correct file format
jucamohedano Mar 1, 2023
774ab96
Merge remote-tracking branch 'skops/main' into main
jucamohedano Mar 7, 2023
1d23024
fix: argument type
jucamohedano Mar 7, 2023
b420020
fix reading file in example
jucamohedano May 1, 2023
3b78b34
fix: add trusted argument to io.load when checking file extension
jucamohedano May 1, 2023
4e42408
fix: add support for model parameter in init based on its type; updat…
jucamohedano May 26, 2023
af245b1
Merge remote-tracking branch 'skops-upstream/main' into main
jucamohedano May 26, 2023
6138885
fix: revert files changes
jucamohedano May 26, 2023
dd49d5c
change file format in example to skops
jucamohedano May 26, 2023
21c3a5f
fix: revert to always create README if model is str/Path and handle i…
jucamohedano Jun 2, 2023
98e64b4
replace open file block of code with _load_model function
jucamohedano Jul 12, 2023
af9b3ac
Merge remote-tracking branch 'skops-upstream/main' into HEAD
jucamohedano Jul 12, 2023
ffdb1e8
trust types in model file
jucamohedano Jul 13, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions examples/plot_intelex.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@

with NamedTemporaryFile(mode="bw", prefix="stock-", suffix=".pkl") as fp:
pickle.dump(clf, file=fp)

fp.seek(0)
stock_repo = mkdtemp(prefix="stock-")
hub_utils.init(
model=fp.name,
Expand All @@ -163,7 +163,7 @@

with NamedTemporaryFile(mode="bw", prefix="opt-", suffix=".pkl") as fp:
pickle.dump(clf_opt, file=fp)

fp.seek(0)
opt_repo = mkdtemp(prefix="opt-")
hub_utils.init(
model=fp.name,
Expand Down
2 changes: 1 addition & 1 deletion examples/plot_tabular_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@
# Initialize a repository to save our files in
# ============================================
# We will now initialize a repository and save our model
_, pkl_name = mkstemp(prefix="skops-", suffix=".pkl")
_, pkl_name = mkstemp(prefix="skops-", suffix=".skops")

with open(pkl_name, mode="bw") as f:
sio.dump(model, file=f)
Expand Down
76 changes: 62 additions & 14 deletions skops/hub_utils/_hf_hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,17 @@
import os
import shutil
from pathlib import Path
from pickle import dump as pickle_dump
from typing import Any, List, Literal, MutableMapping, Optional, Sequence, Union

import numpy as np
from huggingface_hub import HfApi, InferenceApi, snapshot_download
from sklearn.base import BaseEstimator
from sklearn.utils import check_array

from skops import card, io
from skops.card._model_card import _load_model

SUPPORTED_TASKS = [
"tabular-classification",
"tabular-regression",
Expand Down Expand Up @@ -405,8 +410,6 @@ def init(
f"Task {task} not supported. Supported tasks are: {SUPPORTED_TASKS}"
)

model = _check_model_file(model)

dst.mkdir(parents=True, exist_ok=True)

# add intelex requirement, if it's used and not already in requirements
Expand All @@ -415,19 +418,64 @@ def init(
):
requirements.append("scikit-learn-intelex")

# model parameter can be either a path or a model object
try:
shutil.copy2(src=model, dst=dst)

model_name = model.name
_create_config(
model_path=model_name,
requirements=requirements,
dst=dst,
task=task,
data=data,
model_format=model_format,
use_intelex=use_intelex,
)
if isinstance(model, (str, Path)):
model = _check_model_file(model)
shutil.copy2(src=model, dst=dst)

model_name = model.name

_create_config(
model_path=model_name,
requirements=requirements,
dst=dst,
task=task,
data=data,
model_format=model_format,
use_intelex=use_intelex,
)

# load model from file
model = _load_model(model, trusted=True)
model_card = card.Card(model, metadata=card.metadata_from_config(dst))
model_card.save(dst / "README.md")
elif isinstance(model, BaseEstimator):
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we need no check for BaseEstimator here, so this can just be else and the error below can be removed. There could be valid models here that don't inherit from BaseEstimator. It is the user's responsibility to provide an sklearn-compatible model.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, got it!

# if it is a model object and its format is set to auto, choose skops by default
if model_format == "auto":
model_format = "skops"
elif model_format in ["pkl", "pickle", "joblib"]:
model_format = "pickle"
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need an else clause below because if the user passes model_format="skosp" or another typo, that would just be accepted as is, even though it is invalid. Then further below, the model would not be saved without any indication that something went wrong.

Essentially, the model format checking logic should be the same as inside _create_config. And since checking the model format is now performed in init, we should also be able to remove it from _create_config completely.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If so, I can remove that logic from _create_config and have it just in init. I actually started this PR by looking at _create_config. Let me know if you would like to make this change now.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I think that change should be good.

model_name = Path(dst / f"model.{model_format}")

_create_config(
model_path=model_name,
requirements=requirements,
dst=dst,
task=task,
data=data,
model_format=model_format,
use_intelex=use_intelex,
)

# create model file if it doesn't exist to make a valid repository
if not os.path.isfile(model_name):
if model_format == "pickle":
with open(model_name, "wb") as f:
pickle_dump(model, f)
elif model_format == "skops":
io.dump(model, model_name)

# create README if it doesn't exist
if not os.path.isfile(dst / "README.md"):
model_card = card.Card(model, metadata=card.metadata_from_config(dst))
model_card.save(dst / "README.md")
else:
raise ValueError(
"Cannot determine the input model argument. "
"Please indicate a model with the expected type."
)

except Exception:
shutil.rmtree(dst)
raise
Expand Down
71 changes: 69 additions & 2 deletions skops/hub_utils/tests/test_hf_hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import sklearn
from flaky import flaky
from huggingface_hub import HfApi
from huggingface_hub.repocard import RepoCard
from huggingface_hub.utils import RepositoryNotFoundError
from sklearn.datasets import load_diabetes, load_iris
from sklearn.linear_model import LinearRegression, LogisticRegression
Expand Down Expand Up @@ -85,11 +86,13 @@ def classifier(repo_path, config_json):
path = repo_path / model_file

try:
if file_format == "pickle":
if file_format == "pickle" or file_format == "joblib":
with open(path, "wb") as f:
pickle.dump(clf, f)
elif file_format == "skops":
dump(clf, path)
elif file_format == "auto":
dump(clf, path)
yield path
finally:
path.unlink(missing_ok=True)
Expand All @@ -108,10 +111,22 @@ def classifier(repo_path, config_json):
"model": {"file": "model.skops"},
}
},
"auto": {
"sklearn": {
"environment": ['scikit-learn="1.1.1"'],
"model": {"file": "model.skops"},
}
},
"joblib": {
"sklearn": {
"environment": ['scikit-learn="1.1.1"'],
"model": {"file": "model.joblib"},
}
},
}


@pytest.fixture(scope="session", params=["skops", "pickle"])
@pytest.fixture(scope="session", params=["skops", "pickle", "auto", "joblib"])
def config_json(repo_path, request):
path = repo_path / "config.json"
try:
Expand Down Expand Up @@ -292,6 +307,8 @@ def test_init(classifier, config_json):
)
_validate_folder(path=dir_path)

assert os.path.isfile(Path(dir_path) / "README.md")

# it should fail a second time since the folder is no longer empty.
with pytest.raises(OSError, match="None-empty dst path already exists!"):
init(
Expand All @@ -303,6 +320,56 @@ def test_init(classifier, config_json):
)


@pytest.fixture(
    params=[pytest.param("classifier", marks=pytest.mark.usefixtures), get_classifier()]
)
def classifiers(request):
    """Provide the ``model`` argument for ``init`` in both supported forms.

    The fixture is parametrized with two values:

    * the string ``"classifier"``, which names another fixture and is
      resolved to that fixture's value (a path to a saved model file);
    * the object returned by ``get_classifier()``, which is passed through
      unchanged.

    Returns
    -------
    The resolved fixture value for the string parameter, otherwise the
    parameter itself.
    """
    param = request.param
    if isinstance(param, str):
        # A string names a fixture. Resolve it explicitly instead of calling
        # getfixturevalue on every parameter and catching the failure: a
        # blanket ``except Exception`` would also swallow a real error raised
        # while setting up the "classifier" fixture and silently hand the
        # bare string to the test.
        return request.getfixturevalue(param)
    # Not a fixture name: the parameter is already the model object.
    return param


def test_override_init_modelcard(classifiers, config_json):
# create a temp directory and delete it, we just need a unique name.
dir_path = tempfile.mkdtemp()
shutil.rmtree(dir_path)
Comment thread
merveenoyan marked this conversation as resolved.

version = metadata.version("scikit-learn")
_, model_format = config_json
# joblib type falls under auto format, explicitly set to auto
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# joblib type falls unders auto format, explicityly set to auto
# joblib type falls under auto format, explicitly set to auto

# because we can't repeat key "auto" in CONFIG dict
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I don't understand that, could you please explain further?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, that comment isn't self-explanatory. The dictionary CONFIG in test_hf_hub contains 3 types of models. I wanted to test for a model with name model.joblib, so I added that model to the dictionary. If I recall correctly, adding the key:value pair for the joblib model made other tests fail. But I could fix this by changing the joblib model type to auto and still have the name model.joblib to test for.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, I guess it depends on why the other tests fail. If this uncovers a bug with existing code, that would be good to know. Otherwise, I think it's okay to have a separate test for joblib and not changing CONFIG.

if model_format == "joblib":
model_format = "auto"

init(
model=classifiers,
requirements=[f'scikit-learn="{version}"'],
dst=dir_path,
task="tabular-classification",
data=iris.data,
model_format=model_format,
)
_validate_folder(path=dir_path)

# initial card does not have a license set
with pytest.raises(
AttributeError, match="'CardData' object has no attribute 'license'"
):
model_card = RepoCard.load(Path(dir_path) / "README.md")
model_card.data.license

# override the existing model card created by init with a license attribute
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For my better understanding, is this testing some new behavior added by this PR or some general behavior?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, this is not a new behavior added by this PR. Now that you call this out I see that maybe we can make this test simpler by checking that the README.md exists after the call to init and that's it? I think that at the time of writing the test, I decided to update the README.md as a way of checking that it exists.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we can make this test simpler by checking that the README.md exists after the call to init and that's it?

Yes, that sounds like it is sufficient as a test.

model = get_classifier()
model_card = card.Card(model, metadata=card.metadata_from_config(Path(dir_path)))
model_card.metadata.license = "mit"
model_card.save(Path(dir_path) / "README.md")
new_card = RepoCard.load(Path(dir_path) / "README.md")
assert new_card.data.license == "mit"


def test_init_no_warning_or_error(classifier, config_json):
config_path, file_format = config_json
# for the happy path, there should be no warning
Expand Down