skops-dev · BenjaminBossan · Dec 13, 2022 · Dec 5, 2022 · Dec 7, 2022 · Dec 9, 2022
diff --git a/docs/changes.rst b/docs/changes.rst
@@ -16,6 +16,10 @@ v0.4
   pattern, like you'd do with ``pickle``. :pr:`234` by `Benjamin Bossan`_.
 - All `scikit-learn` estimators are trusted by default.
   :pr:`237` by :user:`Edoardo Abati <EdAbati>`.
+- Add `model_format` argument to :meth:`skops.hub_utils.init` to be stored in
+  `config.json` so that we know how to load a model from the repository.
+  :pr:`242` by `Merve Noyan`_.
+
 
 v0.3
 ----

diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py
@@ -151,6 +151,11 @@ def _create_config(
         "text-regression",
     ],
     data,
+    model_format: Literal[  # type: ignore
+        "skops",
+        "pickle",
+        "auto",
+    ] = "auto",
 ) -> None:
     """Write the configuration into a ``config.json`` file.
 
@@ -185,6 +190,13 @@ def _create_config(
 
         The first 3 input values are used as example inputs.
 
+    model_format: str
+        The format used to persist the model. Can be ``"auto"``, ``"skops"``
+        or ``"pickle"``. Defaults to ``"auto"``, which would mean:
+
+        - ``"pickle"`` if the extension is one of ``{".pickle", ".pkl", ".joblib"}``
+        - ``"skops"`` if the extension is ``".skops"``
+
     Returns
     -------
     None
@@ -195,10 +207,22 @@ def _create_config(
     def recursively_default_dict() -> MutableMapping:
         return collections.defaultdict(recursively_default_dict)
 
+    if model_format == "auto":
+        extension = Path(model_path).suffix
+        if extension in [".pkl", ".pickle", ".joblib"]:
+            model_format = "pickle"
+        elif extension == ".skops":
+            model_format = "skops"
+    if model_format not in ["skops", "pickle"]:
+        raise ValueError(
+            "Cannot determine the input file format. Please indicate the format using"
+            " the `model_format` argument."
+        )
     config = recursively_default_dict()
     config["sklearn"]["model"]["file"] = str(model_path)
     config["sklearn"]["environment"] = requirements
     config["sklearn"]["task"] = task
+    config["sklearn"]["model_format"] = model_format
 
     if "tabular" in task:
         config["sklearn"]["example_input"] = _get_example_input(data)
@@ -251,6 +275,11 @@ def init(
         "text-regression",
     ],
     data,
+    model_format: Literal[  # type: ignore
+        "skops",
+        "pickle",
+        "auto",
+    ] = "auto",
 ) -> None:
     """Initialize a scikit-learn based Hugging Face repo.
 
@@ -291,6 +320,10 @@ def init(
         :class:`numpy.ndarray`. If ``task`` is ``"text-classification"`` or
         ``"text-regression"``, the data needs to be a ``list`` of strings.
 
+    model_format: str
+        The format the model was persisted in. Can be ``"auto"``, ``"skops"``
+        or ``"pickle"``. Defaults to ``"auto"`` that relies on file extension.
+
     Returns
     -------
     None
@@ -318,6 +351,7 @@ def init(
             dst=dst,
             task=task,
             data=data,
+            model_format=model_format,
         )
     except Exception:
         shutil.rmtree(dst)

diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py
@@ -36,6 +36,7 @@
     _validate_folder,
 )
 from skops.hub_utils.tests.common import HF_HUB_TOKEN
+from skops.io import dump
 from skops.utils.fixes import metadata, path_unlink
 
 iris = load_iris(as_frame=True, return_X_y=False)
@@ -75,39 +76,67 @@ def get_regressor():
 
 
 @pytest.fixture(scope="session")
-def classifier_pickle(repo_path):
-    # Create a simple pickle file for the purpose of testing
+def classifier(repo_path, config_json):
+    # Create a simple model file for the purpose of testing
     clf = get_classifier()
-    path = repo_path / "model.pickle"
+    config_path, file_format = config_json
+    model_file = CONFIG[file_format]["sklearn"]["model"]["file"]
+    path = repo_path / model_file
 
     try:
-        with open(path, "wb") as f:
-            pickle.dump(clf, f)
+        if file_format == "pickle":
+            with open(path, "wb") as f:
+                pickle.dump(clf, f)
+        elif file_format == "skops":
+            dump(clf, path)
         yield path
     finally:
         path_unlink(path, missing_ok=True)
 
 
 CONFIG = {
-    "sklearn": {
-        "environment": ['scikit-learn="1.1.1"'],
-        "model": {"file": "model.pickle"},
-    }
+    "pickle": {
+        "sklearn": {
+            "environment": ['scikit-learn="1.1.1"'],
+            "model": {"file": "model.pickle"},
+        }
+    },
+    "skops": {
+        "sklearn": {
+            "environment": ['scikit-learn="1.1.1"'],
+            "model": {"file": "model.skops"},
+        }
+    },
 }
 
 
-@pytest.fixture(scope="session")
-def config_json(repo_path):
+@pytest.fixture(scope="session", params=["skops", "pickle"])
+def config_json(repo_path, request):
     path = repo_path / "config.json"
     try:
         with open(path, "w") as f:
-            json.dump(CONFIG, f)
-        yield path
+            json.dump(CONFIG[request.param], f)
+        yield path, request.param
     finally:
         path_unlink(path, missing_ok=True)
 
 
+def test_validate_format(classifier):
+    dir_path = tempfile.mkdtemp()
+    shutil.rmtree(dir_path)
+    with pytest.raises(ValueError, match="Cannot determine the input file*"):
+        init(
+            model=classifier,
+            requirements=["scikit-learn"],
+            dst=dir_path,
+            task="tabular-classification",
+            data=iris.data,
+            model_format="dummy",
+        )
+
+
 def test_validate_folder(config_json):
+    config_path, file_format = config_json
     _, file_path = tempfile.mkstemp()
     dir_path = tempfile.mkdtemp()
     with pytest.raises(TypeError, match="The given path is not a directory."):
@@ -124,11 +153,12 @@ def test_validate_folder(config_json):
     ):
         _validate_folder(path=dir_path)
 
-    shutil.copy2(config_json, dir_path)
-    with pytest.raises(TypeError, match="Model file model.pickle does not exist."):
+    shutil.copy2(config_path, dir_path)
+    model_file = CONFIG[file_format]["sklearn"]["model"]["file"]
+    with pytest.raises(TypeError, match=f"Model file {model_file} does not exist."):
         _validate_folder(path=dir_path)
 
-    (Path(dir_path) / "model.pickle").touch()
+    (Path(dir_path) / model_file).touch()
 
     # this should now work w/o an error
     _validate_folder(path=dir_path)
@@ -210,11 +240,11 @@ def test_create_config_invalid_text_data(temp_path):
         )
 
 
-def test_atomic_init(classifier_pickle, temp_path):
+def test_atomic_init(classifier, temp_path):
     with pytest.raises(ValueError):
         # this fails since we're passing an invalid task.
         init(
-            model=classifier_pickle,
+            model=classifier,
             requirements=["scikit-learn"],
             dst=temp_path,
             task="tabular-classification",
@@ -224,35 +254,36 @@ def test_atomic_init(classifier_pickle, temp_path):
     # this passes even though the above init has failed once, on the same
     # destination path.
     init(
-        model=classifier_pickle,
+        model=classifier,
         requirements=["scikit-learn"],
         dst=temp_path,
         task="tabular-classification",
         data=iris.data,
     )
 
 
-def test_init_invalid_task(classifier_pickle, temp_path):
+def test_init_invalid_task(classifier, temp_path):
     with pytest.raises(
         ValueError, match="Task invalid not supported. Supported tasks are"
     ):
         init(
-            model=classifier_pickle,
+            model=classifier,
             requirements=["scikit-learn"],
             dst=temp_path,
             task="invalid",
             data=iris.data,
         )
 
 
-def test_init(classifier_pickle, config_json):
+def test_init(classifier, config_json):
+    config_path, file_format = config_json
     # create a temp directory and delete it, we just need a unique name.
     dir_path = tempfile.mkdtemp()
     shutil.rmtree(dir_path)
 
     version = metadata.version("scikit-learn")
     init(
-        model=classifier_pickle,
+        model=classifier,
         requirements=[f'scikit-learn="{version}"'],
         dst=dir_path,
         task="tabular-classification",
@@ -263,15 +294,16 @@ def test_init(classifier_pickle, config_json):
     # it should fail a second time since the folder is no longer empty.
     with pytest.raises(OSError, match="None-empty dst path already exists!"):
         init(
-            model=classifier_pickle,
+            model=classifier,
             requirements=[f'scikit-learn="{version}"'],
             dst=dir_path,
             task="tabular-classification",
             data=iris.data,
         )
 
 
-def test_init_no_warning_or_error(classifier_pickle, config_json):
+def test_init_no_warning_or_error(classifier, config_json):
+    config_path, file_format = config_json
     # for the happy path, there should be no warning
     dir_path = tempfile.mkdtemp()
     shutil.rmtree(dir_path)
@@ -280,7 +312,7 @@ def test_init_no_warning_or_error(classifier_pickle, config_json):
     with warnings.catch_warnings():
         warnings.simplefilter("error")
         init(
-            model=classifier_pickle,
+            model=classifier,
             requirements=[f'scikit-learn="{version}"'],
             dst=dir_path,
             task="tabular-classification",
@@ -289,6 +321,7 @@ def test_init_no_warning_or_error(classifier_pickle, config_json):
 
 
 def test_model_file_does_not_exist_raises(repo_path, config_json):
+    config_path, file_format = config_json
     # when the model file does not exist, raise an OSError
     model_path = repo_path / "foobar.pickle"
     dir_path = tempfile.mkdtemp()
@@ -308,6 +341,7 @@ def test_model_file_does_not_exist_raises(repo_path, config_json):
 
 
 def test_init_empty_model_file_errors(repo_path, config_json):
+    config_path, file_format = config_json
     # when model file is empty, warn users
     model_path = Path(repo_path / "foobar.pickle")
     model_path.touch()
@@ -336,14 +370,15 @@ def test_push_download(
     explicit_create,
     repo_path,
     destination_path,
-    classifier_pickle,
+    classifier,
     config_json,
 ):
+    config_path, file_format = config_json
     client = HfApi()
 
     version = metadata.version("scikit-learn")
     init(
-        model=classifier_pickle,
+        model=classifier,
         requirements=[f'scikit-learn="{version}"'],
         dst=destination_path,
         task="tabular-classification",
@@ -373,7 +408,7 @@ def test_push_download(
         download(repo_id=repo_id, dst=destination_path, token=HF_HUB_TOKEN)
 
     files = client.list_repo_files(repo_id=repo_id, use_auth_token=HF_HUB_TOKEN)
-    for f_name in [classifier_pickle.name, config_json.name]:
+    for f_name in [classifier.name, config_path.name]:
         assert f_name in files
 
     try:
@@ -408,6 +443,7 @@ def repo_path_for_inference():
     ids=["classifier", "regressor"],
 )
 def test_inference(
+    config_json,
     model_func,
     data,
     task,
@@ -416,11 +452,17 @@ def test_inference(
 ):
     # test inference backend for classifier and regressor models. This test can
     # take a lot of time and be flaky.
-    client = HfApi()
+    config_path, file_format = config_json
+    if file_format != "pickle":
+        pytest.skip(
+            f"Inference only supports pickle at the moment. Given format: {file_format}"
+        )
 
+    client = HfApi()
     repo_path = repo_path_for_inference
+    model_file = CONFIG[file_format]["sklearn"]["model"]["file"]
     model = model_func()
-    model_path = repo_path / "model.pickle"
+    model_path = repo_path / model_file
 
     with open(model_path, "wb") as f:
         pickle.dump(model, f)
@@ -434,7 +476,7 @@ def test_inference(
         data=data.data,
     )
 
-    # a model card is needed for inference engine to work.
+    # TODO: remove when card init at repo init is merged
     model_card = card.Card(
         model, metadata=card.metadata_from_config(Path(destination_path))
     )
@@ -464,9 +506,11 @@ def test_inference(
     assert np.allclose(output, y_pred)
 
 
-def test_get_config(repo_path):
+def test_get_config(repo_path, config_json):
+    config_path, file_format = config_json
     config = get_config(repo_path)
-    assert config == CONFIG
+
+    assert config == CONFIG[file_format]
     assert get_requirements(repo_path) == ['scikit-learn="1.1.1"']
 
 
@@ -528,14 +572,14 @@ def test_get_column_names_pandas_not_installed(pandas_not_installed):
 
 class TestAddFiles:
     @pytest.fixture
-    def init_path(self, classifier_pickle, config_json):
+    def init_path(self, classifier, config_json):
         # create temporary directory
         dir_path = tempfile.mkdtemp()
         shutil.rmtree(dir_path)
 
         version = metadata.version("scikit-learn")
         init(
-            model=classifier_pickle,
+            model=classifier,
             requirements=[f'scikit-learn="{version}"'],
             dst=dir_path,
             task="tabular-classification",