Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/changes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ v0.4
pattern, like you'd do with ``pickle``. :pr:`234` by `Benjamin Bossan`_.
- All `scikit-learn` estimators are trusted by default.
:pr:`237` by :user:`Edoardo Abati <EdAbati>`.
- Add `model_format` argument to :meth:`skops.hub_utils.init` to be stored in
`config.json` so that we know how to load a model from the repository.
:pr:`242` by `Merve Noyan`_.
Comment on lines +19 to +21
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I didn't already know, I would be confused about what this new feature does. Maybe we should avoid the word "we" in the changes.rst and instead use more neutral language.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what do you suggest here @BenjaminBossan

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not a blocker, so feel free to merge.

Maybe something like:

Add model_format argument to :meth:skops.hub_utils.init to be stored in
config.json, which is used to determine the format of the model file
(pickle or skops).



v0.3
----
Expand Down
34 changes: 34 additions & 0 deletions skops/hub_utils/_hf_hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,11 @@ def _create_config(
"text-regression",
],
data,
model_format: Literal[ # type: ignore
"skops",
"pickle",
"auto",
] = "auto",
) -> None:
"""Write the configuration into a ``config.json`` file.

Expand Down Expand Up @@ -185,6 +190,13 @@ def _create_config(

The first 3 input values are used as example inputs.

model_format: str
Comment thread
merveenoyan marked this conversation as resolved.
The format used to persist the model. Can be ``"auto"``, ``"skops"``
or ``"pickle"``. Defaults to ``"auto"``, which would mean:

- ``"pickle"`` if the extension is one of ``{".pickle", ".pkl", ".joblib"}``
- ``"skops"`` if the extension is ``".skops"``
Comment thread
merveenoyan marked this conversation as resolved.

Returns
-------
None
Expand All @@ -195,10 +207,22 @@ def _create_config(
def recursively_default_dict() -> MutableMapping:
return collections.defaultdict(recursively_default_dict)

if model_format == "auto":
extension = Path(model_path).suffix
if extension in [".pkl", ".pickle", ".joblib"]:
model_format = "pickle"
elif extension == ".skops":
model_format = "skops"
if model_format not in ["skops", "pickle"]:
raise ValueError(
"Cannot determine the input file format. Please indicate the format using"
" the `model_format` argument."
)
config = recursively_default_dict()
config["sklearn"]["model"]["file"] = str(model_path)
config["sklearn"]["environment"] = requirements
config["sklearn"]["task"] = task
config["sklearn"]["model_format"] = model_format

if "tabular" in task:
config["sklearn"]["example_input"] = _get_example_input(data)
Expand Down Expand Up @@ -251,6 +275,11 @@ def init(
"text-regression",
],
data,
model_format: Literal[ # type: ignore
"skops",
"pickle",
"auto",
] = "auto",
) -> None:
"""Initialize a scikit-learn based Hugging Face repo.

Expand Down Expand Up @@ -291,6 +320,10 @@ def init(
:class:`numpy.ndarray`. If ``task`` is ``"text-classification"`` or
``"text-regression"``, the data needs to be a ``list`` of strings.

model_format: str
The format the model was persisted in. Can be ``"auto"``, ``"skops"``
or ``"pickle"``. Defaults to ``"auto"`` that relies on file extension.
Comment thread
merveenoyan marked this conversation as resolved.

Returns
-------
None
Expand Down Expand Up @@ -318,6 +351,7 @@ def init(
dst=dst,
task=task,
data=data,
model_format=model_format,
)
except Exception:
shutil.rmtree(dst)
Expand Down
116 changes: 80 additions & 36 deletions skops/hub_utils/tests/test_hf_hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
_validate_folder,
)
from skops.hub_utils.tests.common import HF_HUB_TOKEN
from skops.io import dump
from skops.utils.fixes import metadata, path_unlink

iris = load_iris(as_frame=True, return_X_y=False)
Expand Down Expand Up @@ -75,39 +76,67 @@ def get_regressor():


@pytest.fixture(scope="session")
def classifier_pickle(repo_path):
# Create a simple pickle file for the purpose of testing
def classifier(repo_path, config_json):
# Create a simple model file for the purpose of testing
clf = get_classifier()
path = repo_path / "model.pickle"
config_path, file_format = config_json
model_file = CONFIG[file_format]["sklearn"]["model"]["file"]
path = repo_path / model_file

try:
with open(path, "wb") as f:
pickle.dump(clf, f)
if file_format == "pickle":
with open(path, "wb") as f:
pickle.dump(clf, f)
elif file_format == "skops":
dump(clf, path)
yield path
finally:
path_unlink(path, missing_ok=True)


CONFIG = {
"sklearn": {
"environment": ['scikit-learn="1.1.1"'],
"model": {"file": "model.pickle"},
}
"pickle": {
"sklearn": {
"environment": ['scikit-learn="1.1.1"'],
"model": {"file": "model.pickle"},
}
},
"skops": {
"sklearn": {
"environment": ['scikit-learn="1.1.1"'],
"model": {"file": "model.skops"},
}
},
}


@pytest.fixture(scope="session")
def config_json(repo_path):
@pytest.fixture(scope="session", params=["skops", "pickle"])
def config_json(repo_path, request):
path = repo_path / "config.json"
try:
with open(path, "w") as f:
json.dump(CONFIG, f)
yield path
json.dump(CONFIG[request.param], f)
yield path, request.param
finally:
path_unlink(path, missing_ok=True)


def test_validate_format(classifier):
dir_path = tempfile.mkdtemp()
shutil.rmtree(dir_path)
with pytest.raises(ValueError, match="Cannot determine the input file*"):
init(
model=classifier,
requirements=["scikit-learn"],
dst=dir_path,
task="tabular-classification",
data=iris.data,
model_format="dummy",
)


def test_validate_folder(config_json):
config_path, file_format = config_json
_, file_path = tempfile.mkstemp()
dir_path = tempfile.mkdtemp()
with pytest.raises(TypeError, match="The given path is not a directory."):
Expand All @@ -124,11 +153,12 @@ def test_validate_folder(config_json):
):
_validate_folder(path=dir_path)

shutil.copy2(config_json, dir_path)
with pytest.raises(TypeError, match="Model file model.pickle does not exist."):
shutil.copy2(config_path, dir_path)
model_file = CONFIG[file_format]["sklearn"]["model"]["file"]
with pytest.raises(TypeError, match=f"Model file {model_file} does not exist."):
_validate_folder(path=dir_path)

(Path(dir_path) / "model.pickle").touch()
(Path(dir_path) / model_file).touch()

# this should now work w/o an error
_validate_folder(path=dir_path)
Expand Down Expand Up @@ -210,11 +240,11 @@ def test_create_config_invalid_text_data(temp_path):
)


def test_atomic_init(classifier_pickle, temp_path):
def test_atomic_init(classifier, temp_path):
with pytest.raises(ValueError):
# this fails since we're passing an invalid task.
init(
model=classifier_pickle,
model=classifier,
requirements=["scikit-learn"],
dst=temp_path,
task="tabular-classification",
Expand All @@ -224,35 +254,36 @@ def test_atomic_init(classifier_pickle, temp_path):
# this passes even though the above init has failed once, on the same
# destination path.
init(
model=classifier_pickle,
model=classifier,
requirements=["scikit-learn"],
dst=temp_path,
task="tabular-classification",
data=iris.data,
)


def test_init_invalid_task(classifier_pickle, temp_path):
def test_init_invalid_task(classifier, temp_path):
with pytest.raises(
ValueError, match="Task invalid not supported. Supported tasks are"
):
init(
model=classifier_pickle,
model=classifier,
requirements=["scikit-learn"],
dst=temp_path,
task="invalid",
data=iris.data,
)


def test_init(classifier_pickle, config_json):
def test_init(classifier, config_json):
config_path, file_format = config_json
# create a temp directory and delete it, we just need a unique name.
dir_path = tempfile.mkdtemp()
shutil.rmtree(dir_path)

version = metadata.version("scikit-learn")
init(
model=classifier_pickle,
model=classifier,
requirements=[f'scikit-learn="{version}"'],
dst=dir_path,
task="tabular-classification",
Expand All @@ -263,15 +294,16 @@ def test_init(classifier_pickle, config_json):
# it should fail a second time since the folder is no longer empty.
with pytest.raises(OSError, match="None-empty dst path already exists!"):
init(
model=classifier_pickle,
model=classifier,
requirements=[f'scikit-learn="{version}"'],
dst=dir_path,
task="tabular-classification",
data=iris.data,
)


def test_init_no_warning_or_error(classifier_pickle, config_json):
def test_init_no_warning_or_error(classifier, config_json):
config_path, file_format = config_json
# for the happy path, there should be no warning
dir_path = tempfile.mkdtemp()
shutil.rmtree(dir_path)
Expand All @@ -280,7 +312,7 @@ def test_init_no_warning_or_error(classifier_pickle, config_json):
with warnings.catch_warnings():
warnings.simplefilter("error")
init(
model=classifier_pickle,
model=classifier,
requirements=[f'scikit-learn="{version}"'],
dst=dir_path,
task="tabular-classification",
Expand All @@ -289,6 +321,7 @@ def test_init_no_warning_or_error(classifier_pickle, config_json):


def test_model_file_does_not_exist_raises(repo_path, config_json):
config_path, file_format = config_json
# when the model file does not exist, raise an OSError
model_path = repo_path / "foobar.pickle"
dir_path = tempfile.mkdtemp()
Expand All @@ -308,6 +341,7 @@ def test_model_file_does_not_exist_raises(repo_path, config_json):


def test_init_empty_model_file_errors(repo_path, config_json):
config_path, file_format = config_json
# when model file is empty, warn users
model_path = Path(repo_path / "foobar.pickle")
model_path.touch()
Expand Down Expand Up @@ -336,14 +370,15 @@ def test_push_download(
explicit_create,
repo_path,
destination_path,
classifier_pickle,
classifier,
config_json,
):
config_path, file_format = config_json
client = HfApi()

version = metadata.version("scikit-learn")
init(
model=classifier_pickle,
model=classifier,
requirements=[f'scikit-learn="{version}"'],
dst=destination_path,
task="tabular-classification",
Expand Down Expand Up @@ -373,7 +408,7 @@ def test_push_download(
download(repo_id=repo_id, dst=destination_path, token=HF_HUB_TOKEN)

files = client.list_repo_files(repo_id=repo_id, use_auth_token=HF_HUB_TOKEN)
for f_name in [classifier_pickle.name, config_json.name]:
for f_name in [classifier.name, config_path.name]:
assert f_name in files

try:
Expand Down Expand Up @@ -408,6 +443,7 @@ def repo_path_for_inference():
ids=["classifier", "regressor"],
)
def test_inference(
config_json,
model_func,
data,
task,
Expand All @@ -416,11 +452,17 @@ def test_inference(
):
# test inference backend for classifier and regressor models. This test can
# take a lot of time and be flaky.
client = HfApi()
config_path, file_format = config_json
if file_format != "pickle":
pytest.skip(
f"Inference only supports pickle at the moment. Given format: {file_format}"
)

client = HfApi()
repo_path = repo_path_for_inference
model_file = CONFIG[file_format]["sklearn"]["model"]["file"]
model = model_func()
model_path = repo_path / "model.pickle"
model_path = repo_path / model_file

with open(model_path, "wb") as f:
pickle.dump(model, f)
Expand All @@ -434,7 +476,7 @@ def test_inference(
data=data.data,
)

# a model card is needed for inference engine to work.
# TODO: remove when card init at repo init is merged
model_card = card.Card(
model, metadata=card.metadata_from_config(Path(destination_path))
)
Expand Down Expand Up @@ -464,9 +506,11 @@ def test_inference(
assert np.allclose(output, y_pred)


def test_get_config(repo_path):
def test_get_config(repo_path, config_json):
config_path, file_format = config_json
config = get_config(repo_path)
assert config == CONFIG

assert config == CONFIG[file_format]
assert get_requirements(repo_path) == ['scikit-learn="1.1.1"']


Expand Down Expand Up @@ -528,14 +572,14 @@ def test_get_column_names_pandas_not_installed(pandas_not_installed):

class TestAddFiles:
@pytest.fixture
def init_path(self, classifier_pickle, config_json):
def init_path(self, classifier, config_json):
# create temporary directory
dir_path = tempfile.mkdtemp()
shutil.rmtree(dir_path)

version = metadata.version("scikit-learn")
init(
model=classifier_pickle,
model=classifier,
requirements=[f'scikit-learn="{version}"'],
dst=dir_path,
task="tabular-classification",
Expand Down