skops-dev · jucamohedano · Nov 1, 2022 · Nov 2, 2022 · Nov 2, 2022 · Nov 3, 2022
diff --git a/examples/plot_intelex.py b/examples/plot_intelex.py
@@ -150,7 +150,7 @@
 
 with NamedTemporaryFile(mode="bw", prefix="stock-", suffix=".pkl") as fp:
     pickle.dump(clf, file=fp)
-
+    fp.seek(0)
     stock_repo = mkdtemp(prefix="stock-")
     hub_utils.init(
         model=fp.name,
@@ -163,7 +163,7 @@
 
 with NamedTemporaryFile(mode="bw", prefix="opt-", suffix=".pkl") as fp:
     pickle.dump(clf_opt, file=fp)
-
+    fp.seek(0)
     opt_repo = mkdtemp(prefix="opt-")
     hub_utils.init(
         model=fp.name,

diff --git a/examples/plot_tabular_regression.py b/examples/plot_tabular_regression.py
@@ -63,7 +63,7 @@
 # Initialize a repository to save our files in
 # ============================================
 # We will now initialize a repository and save our model
-_, pkl_name = mkstemp(prefix="skops-", suffix=".pkl")
+_, pkl_name = mkstemp(prefix="skops-", suffix=".skops")
 
 with open(pkl_name, mode="bw") as f:
     sio.dump(model, file=f)

diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py
@@ -10,12 +10,17 @@
 import os
 import shutil
 from pathlib import Path
+from pickle import dump as pickle_dump
 from typing import Any, List, Literal, MutableMapping, Optional, Sequence, Union
 
 import numpy as np
 from huggingface_hub import HfApi, InferenceApi, snapshot_download
+from sklearn.base import BaseEstimator
 from sklearn.utils import check_array
 
+from skops import card, io
+from skops.card._model_card import _load_model
+
 SUPPORTED_TASKS = [
     "tabular-classification",
     "tabular-regression",
@@ -405,8 +410,6 @@ def init(
             f"Task {task} not supported. Supported tasks are: {SUPPORTED_TASKS}"
         )
 
-    model = _check_model_file(model)
-
     dst.mkdir(parents=True, exist_ok=True)
 
     # add intelex requirement, if it's used and not already in requirements
@@ -415,19 +418,64 @@ def init(
     ):
         requirements.append("scikit-learn-intelex")
 
+    # model parameter can be either a path or a model object
     try:
-        shutil.copy2(src=model, dst=dst)
-
-        model_name = model.name
-        _create_config(
-            model_path=model_name,
-            requirements=requirements,
-            dst=dst,
-            task=task,
-            data=data,
-            model_format=model_format,
-            use_intelex=use_intelex,
-        )
+        if isinstance(model, (str, Path)):
+            model = _check_model_file(model)
+            shutil.copy2(src=model, dst=dst)
+
+            model_name = model.name
+
+            _create_config(
+                model_path=model_name,
+                requirements=requirements,
+                dst=dst,
+                task=task,
+                data=data,
+                model_format=model_format,
+                use_intelex=use_intelex,
+            )
+
+            # load model from file
+            model = _load_model(model, trusted=True)
+            model_card = card.Card(model, metadata=card.metadata_from_config(dst))
+            model_card.save(dst / "README.md")
+        elif isinstance(model, BaseEstimator):
+            # if it is a model object and its format is set to auto, choose skops by default
+            if model_format == "auto":
+                model_format = "skops"
+            elif model_format in ["pkl", "pickle", "joblib"]:
+                model_format = "pickle"
+            model_name = Path(dst / f"model.{model_format}")
+
+            _create_config(
+                model_path=model_name,
+                requirements=requirements,
+                dst=dst,
+                task=task,
+                data=data,
+                model_format=model_format,
+                use_intelex=use_intelex,
+            )
+
+            # create model file if it doesn't exist to make a valid repository
+            if not os.path.isfile(model_name):
+                if model_format == "pickle":
+                    with open(model_name, "wb") as f:
+                        pickle_dump(model, f)
+                elif model_format == "skops":
+                    io.dump(model, model_name)
+
+            # create README if it doesn't exist
+            if not os.path.isfile(dst / "README.md"):
+                model_card = card.Card(model, metadata=card.metadata_from_config(dst))
+                model_card.save(dst / "README.md")
+        else:
+            raise ValueError(
+                "Cannot determine the input model argument. "
+                "Please indicate a model with the expected type."
+            )
+
     except Exception:
         shutil.rmtree(dst)
         raise

diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py
@@ -15,6 +15,7 @@
 import sklearn
 from flaky import flaky
 from huggingface_hub import HfApi
+from huggingface_hub.repocard import RepoCard
 from huggingface_hub.utils import RepositoryNotFoundError
 from sklearn.datasets import load_diabetes, load_iris
 from sklearn.linear_model import LinearRegression, LogisticRegression
@@ -85,11 +86,13 @@ def classifier(repo_path, config_json):
     path = repo_path / model_file
 
     try:
-        if file_format == "pickle":
+        if file_format == "pickle" or file_format == "joblib":
             with open(path, "wb") as f:
                 pickle.dump(clf, f)
         elif file_format == "skops":
             dump(clf, path)
+        elif file_format == "auto":
+            dump(clf, path)
         yield path
     finally:
         path.unlink(missing_ok=True)
@@ -108,10 +111,22 @@ def classifier(repo_path, config_json):
             "model": {"file": "model.skops"},
         }
     },
+    "auto": {
+        "sklearn": {
+            "environment": ['scikit-learn="1.1.1"'],
+            "model": {"file": "model.skops"},
+        }
+    },
+    "joblib": {
+        "sklearn": {
+            "environment": ['scikit-learn="1.1.1"'],
+            "model": {"file": "model.joblib"},
+        }
+    },
 }
 
 
-@pytest.fixture(scope="session", params=["skops", "pickle"])
+@pytest.fixture(scope="session", params=["skops", "pickle", "auto", "joblib"])
 def config_json(repo_path, request):
     path = repo_path / "config.json"
     try:
@@ -292,6 +307,8 @@ def test_init(classifier, config_json):
     )
     _validate_folder(path=dir_path)
 
+    assert os.path.isfile(Path(dir_path) / "README.md")
+
     # it should fail a second time since the folder is no longer empty.
     with pytest.raises(OSError, match="None-empty dst path already exists!"):
         init(
@@ -303,6 +320,56 @@ def test_init(classifier, config_json):
         )
 
 
+@pytest.fixture(
+    params=[pytest.param("classifier", marks=pytest.mark.usefixtures), get_classifier()]
+)
+def classifiers(request):
+    # Returns a model object or a path to a model, for all
+    # model format combinations from the CONFIG dict
+    try:
+        yield request.getfixturevalue(request.param)
+    except Exception:  # get_classifier() is not a fixture, exception raised
+        yield request.param
+
+
+def test_override_init_modelcard(classifiers, config_json):
+    # create a temp directory and delete it, we just need a unique name.
+    dir_path = tempfile.mkdtemp()
+    shutil.rmtree(dir_path)
+
+    version = metadata.version("scikit-learn")
+    _, model_format = config_json
+    # joblib type falls under auto format, explicitly set to auto
+    # because we can't repeat key "auto" in CONFIG dict
+    if model_format == "joblib":
+        model_format = "auto"
+
+    init(
+        model=classifiers,
+        requirements=[f'scikit-learn="{version}"'],
+        dst=dir_path,
+        task="tabular-classification",
+        data=iris.data,
+        model_format=model_format,
+    )
+    _validate_folder(path=dir_path)
+
+    # initial card does not have a license set
+    with pytest.raises(
+        AttributeError, match="'CardData' object has no attribute 'license'"
+    ):
+        model_card = RepoCard.load(Path(dir_path) / "README.md")
+        model_card.data.license
+
+    # override existing model card created by init with license attribute
+    model = get_classifier()
+    model_card = card.Card(model, metadata=card.metadata_from_config(Path(dir_path)))
+    model_card.metadata.license = "mit"
+    model_card.save(Path(dir_path) / "README.md")
+    new_card = RepoCard.load(Path(dir_path) / "README.md")
+    assert new_card.data.license == "mit"
+
+
 def test_init_no_warning_or_error(classifier, config_json):
     config_path, file_format = config_json
     # for the happy path, there should be no warning