-
Notifications
You must be signed in to change notification settings - Fork 61
Generate minimal README.md file in repository initialization (issue #113) #207
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
f5e83a9
28b4b0c
f0e9683
1aeb14c
95c0e1b
e0e6c7d
4b6cb73
870797f
d3a0eac
4b3fb8d
eaed93b
f182ee1
9a41cf2
7f7d0c2
1c19795
56165e4
6f99565
c265f50
0b6d3e2
5e1494a
5cfa962
35a30c2
0c4a66f
5436894
3c8e879
eec7a8d
1503850
0249d41
774ab96
1d23024
b420020
3b78b34
4e42408
af245b1
6138885
dd49d5c
21c3a5f
98e64b4
af9b3ac
ffdb1e8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,12 +10,17 @@ | |
| import os | ||
| import shutil | ||
| from pathlib import Path | ||
| from pickle import dump as pickle_dump | ||
| from typing import Any, List, Literal, MutableMapping, Optional, Sequence, Union | ||
|
|
||
| import numpy as np | ||
| from huggingface_hub import HfApi, InferenceApi, snapshot_download | ||
| from sklearn.base import BaseEstimator | ||
| from sklearn.utils import check_array | ||
|
|
||
| from skops import card, io | ||
| from skops.card._model_card import _load_model | ||
|
|
||
| SUPPORTED_TASKS = [ | ||
| "tabular-classification", | ||
| "tabular-regression", | ||
|
|
@@ -405,8 +410,6 @@ def init( | |
| f"Task {task} not supported. Supported tasks are: {SUPPORTED_TASKS}" | ||
| ) | ||
|
|
||
| model = _check_model_file(model) | ||
|
|
||
| dst.mkdir(parents=True, exist_ok=True) | ||
|
|
||
| # add intelex requirement, if it's used and not already in requirements | ||
|
|
@@ -415,19 +418,64 @@ def init( | |
| ): | ||
| requirements.append("scikit-learn-intelex") | ||
|
|
||
| # model parameter can be either a path or a model object | ||
| try: | ||
| shutil.copy2(src=model, dst=dst) | ||
|
|
||
| model_name = model.name | ||
| _create_config( | ||
| model_path=model_name, | ||
| requirements=requirements, | ||
| dst=dst, | ||
| task=task, | ||
| data=data, | ||
| model_format=model_format, | ||
| use_intelex=use_intelex, | ||
| ) | ||
| if isinstance(model, (str, Path)): | ||
| model = _check_model_file(model) | ||
| shutil.copy2(src=model, dst=dst) | ||
|
|
||
| model_name = model.name | ||
|
|
||
| _create_config( | ||
| model_path=model_name, | ||
| requirements=requirements, | ||
| dst=dst, | ||
| task=task, | ||
| data=data, | ||
| model_format=model_format, | ||
| use_intelex=use_intelex, | ||
| ) | ||
|
|
||
| # load model from file | ||
| model = _load_model(model, trusted=True) | ||
| model_card = card.Card(model, metadata=card.metadata_from_config(dst)) | ||
| model_card.save(dst / "README.md") | ||
| elif isinstance(model, BaseEstimator): | ||
| # if it is a model object and its format is set to auto, choose skops by default | ||
| if model_format == "auto": | ||
| model_format = "skops" | ||
| elif model_format in ["pkl", "pickle", "joblib"]: | ||
| model_format = "pickle" | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We need an Essentially, the model format checking logic should be the same as inside
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If so, I can remove that logic from
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I think that change should be good. |
||
| model_name = Path(dst / f"model.{model_format}") | ||
|
|
||
| _create_config( | ||
| model_path=model_name, | ||
| requirements=requirements, | ||
| dst=dst, | ||
| task=task, | ||
| data=data, | ||
| model_format=model_format, | ||
| use_intelex=use_intelex, | ||
| ) | ||
|
|
||
| # create model file if it doesn't exist to make a valid repository | ||
| if not os.path.isfile(model_name): | ||
| if model_format == "pickle": | ||
| with open(model_name, "wb") as f: | ||
| pickle_dump(model, f) | ||
| elif model_format == "skops": | ||
| io.dump(model, model_name) | ||
|
|
||
| # create README if it doesn't exist | ||
| if not os.path.isfile(dst / "README.md"): | ||
| model_card = card.Card(model, metadata=card.metadata_from_config(dst)) | ||
| model_card.save(dst / "README.md") | ||
| else: | ||
| raise ValueError( | ||
| "Cannot determine the input model argument. " | ||
| "Please indicate a model with the expected type." | ||
| ) | ||
|
|
||
| except Exception: | ||
| shutil.rmtree(dst) | ||
| raise | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -15,6 +15,7 @@ | |||||
| import sklearn | ||||||
| from flaky import flaky | ||||||
| from huggingface_hub import HfApi | ||||||
| from huggingface_hub.repocard import RepoCard | ||||||
| from huggingface_hub.utils import RepositoryNotFoundError | ||||||
| from sklearn.datasets import load_diabetes, load_iris | ||||||
| from sklearn.linear_model import LinearRegression, LogisticRegression | ||||||
|
|
@@ -85,11 +86,13 @@ def classifier(repo_path, config_json): | |||||
| path = repo_path / model_file | ||||||
|
|
||||||
| try: | ||||||
| if file_format == "pickle": | ||||||
| if file_format == "pickle" or file_format == "joblib": | ||||||
| with open(path, "wb") as f: | ||||||
| pickle.dump(clf, f) | ||||||
| elif file_format == "skops": | ||||||
| dump(clf, path) | ||||||
| elif file_format == "auto": | ||||||
| dump(clf, path) | ||||||
| yield path | ||||||
| finally: | ||||||
| path.unlink(missing_ok=True) | ||||||
|
|
@@ -108,10 +111,22 @@ def classifier(repo_path, config_json): | |||||
| "model": {"file": "model.skops"}, | ||||||
| } | ||||||
| }, | ||||||
| "auto": { | ||||||
| "sklearn": { | ||||||
| "environment": ['scikit-learn="1.1.1"'], | ||||||
| "model": {"file": "model.skops"}, | ||||||
| } | ||||||
| }, | ||||||
| "joblib": { | ||||||
| "sklearn": { | ||||||
| "environment": ['scikit-learn="1.1.1"'], | ||||||
| "model": {"file": "model.joblib"}, | ||||||
| } | ||||||
| }, | ||||||
| } | ||||||
|
|
||||||
|
|
||||||
| @pytest.fixture(scope="session", params=["skops", "pickle"]) | ||||||
| @pytest.fixture(scope="session", params=["skops", "pickle", "auto", "joblib"]) | ||||||
| def config_json(repo_path, request): | ||||||
| path = repo_path / "config.json" | ||||||
| try: | ||||||
|
|
@@ -292,6 +307,8 @@ def test_init(classifier, config_json): | |||||
| ) | ||||||
| _validate_folder(path=dir_path) | ||||||
|
|
||||||
| assert os.path.isfile(Path(dir_path) / "README.md") | ||||||
|
|
||||||
| # it should fail a second time since the folder is no longer empty. | ||||||
| with pytest.raises(OSError, match="None-empty dst path already exists!"): | ||||||
| init( | ||||||
|
|
@@ -303,6 +320,56 @@ def test_init(classifier, config_json): | |||||
| ) | ||||||
|
|
||||||
|
|
||||||
| @pytest.fixture( | ||||||
| params=[pytest.param("classifier", marks=pytest.mark.usefixtures), get_classifier()] | ||||||
| ) | ||||||
| def classifiers(request): | ||||||
| # Returns a model object or a path to a model with all | ||||||
| # model formats combinations from CONFIG dict | ||||||
| try: | ||||||
| yield request.getfixturevalue(request.param) | ||||||
| except Exception: # get_classifier() is not a fixuture, exception raised | ||||||
| yield request.param | ||||||
|
|
||||||
|
|
||||||
| def test_override_init_modelcard(classifiers, config_json): | ||||||
| # create a temp directory and delete it, we just need a unique name. | ||||||
| dir_path = tempfile.mkdtemp() | ||||||
| shutil.rmtree(dir_path) | ||||||
|
merveenoyan marked this conversation as resolved.
|
||||||
|
|
||||||
| version = metadata.version("scikit-learn") | ||||||
| _, model_format = config_json | ||||||
| # joblib type falls unders auto format, explicityly set to auto | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| # because we can't repeat key "auto" in CONFIG dict | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry, I don't understand that, could you please explain further?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry, that comment isn't self-explainable. The dictionary
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, I guess it depends on why the other tests fail. If this uncovers a bug with existing code, that would be good to know. Otherwise, I think it's okay to have a separate test for joblib and not changing |
||||||
| if model_format == "joblib": | ||||||
| model_format = "auto" | ||||||
|
|
||||||
| init( | ||||||
| model=classifiers, | ||||||
| requirements=[f'scikit-learn="{version}"'], | ||||||
| dst=dir_path, | ||||||
| task="tabular-classification", | ||||||
| data=iris.data, | ||||||
| model_format=model_format, | ||||||
| ) | ||||||
| _validate_folder(path=dir_path) | ||||||
|
|
||||||
| # inital card does not have a license set | ||||||
| with pytest.raises( | ||||||
| AttributeError, match="'CardData' object has no attribute 'license'" | ||||||
| ): | ||||||
| model_card = RepoCard.load(Path(dir_path) / "README.md") | ||||||
| model_card.data.license | ||||||
|
|
||||||
| # override existent modelcard created by init with license attribute | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For my better understanding, is this testing some new behavior added by this PR or some general behavior?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, this is not a new behavior added by this PR. Now that you call this out I see that maybe we can make this test simpler by checking that the README.md exists after the call to
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes, that sounds like it is sufficient as a test. |
||||||
| model = get_classifier() | ||||||
| model_card = card.Card(model, metadata=card.metadata_from_config(Path(dir_path))) | ||||||
| model_card.metadata.license = "mit" | ||||||
| model_card.save(Path(dir_path) / "README.md") | ||||||
| new_card = RepoCard.load(Path(dir_path) / "README.md") | ||||||
| assert new_card.data.license == "mit" | ||||||
|
|
||||||
|
|
||||||
| def test_init_no_warning_or_error(classifier, config_json): | ||||||
| config_path, file_format = config_json | ||||||
| # for the happy path, there should be no warning | ||||||
|
|
||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we need no check for
BaseEstimatorhere, so this can just beelseand the error below can be removed. There could be valid models here that don't inherit fromBaseEstimator. It is the user's responsibility to provide an sklearn-compatible model.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Okay, got it!