diff --git a/docs/changes.rst b/docs/changes.rst
index 4e21b8ea..6240b4de 100644
--- a/docs/changes.rst
+++ b/docs/changes.rst
@@ -13,6 +13,10 @@ v0.5
 ----
 - Support more array-like data types for tabular data and list-like data types
   for text data. :pr:`179` by `Francesco Cariaggi`_.
+- Add an option ``use_intelex`` to :func:`skops.hub_utils.init` which, when
+  enabled, will result in the Hugging Face inference API running with Intel's
+  scikit-learn intelex library, which can accelerate inference times. :pr:`267`
+  by `Benjamin Bossan`_.
 
 v0.4
 ----
diff --git a/skops/card/_model_card.py b/skops/card/_model_card.py
index 33bef3be..0e180336 100644
--- a/skops/card/_model_card.py
+++ b/skops/card/_model_card.py
@@ -155,6 +155,9 @@ def metadata_from_config(config_path: Union[str, Path]) -> ModelCardData:
     if task:
         card_data.tags += [task]
     card_data.model_file = config.get("sklearn", {}).get("model", {}).get("file")  # type: ignore
+    if config.get("sklearn", {}).get("use_intelex"):
+        card_data.tags.append("scikit-learn-intelex")
+
     example_input = config.get("sklearn", {}).get("example_input", None)
     # Documentation on what the widget expects:
     # https://huggingface.co/docs/hub/models-widgets-examples
diff --git a/skops/card/tests/test_card.py b/skops/card/tests/test_card.py
index df00a8c6..28f28f90 100644
--- a/skops/card/tests/test_card.py
+++ b/skops/card/tests/test_card.py
@@ -910,6 +910,38 @@ def test_metadata_model_format_skops(
         metadata = metadata_load(local_path=Path(destination_path) / "README.md")
         assert metadata["model_format"] == "skops"
 
+    def test_metadata_tags_without_sklearn_intelex_tag(
+        self, destination_path, iris_data, iris_pkl_file
+    ):
+        # by default, intelex is not being used
+        X, _ = iris_data
+        hub_utils.init(
+            model=iris_pkl_file,
+            requirements=[],
+            dst=destination_path,
+            task="tabular-classification",
+            data=X,
+        )
+
+        metadata = metadata_from_config(destination_path)
+        assert "scikit-learn-intelex" not in metadata.tags
+
+    def test_metadata_tags_with_sklearn_intelex_tag(
+        self, destination_path, iris_data, iris_pkl_file
+    ):
+        X, _ = iris_data
+        hub_utils.init(
+            model=iris_pkl_file,
+            requirements=[],
+            dst=destination_path,
+            task="tabular-classification",
+            data=X,
+            use_intelex=True,
+        )
+
+        metadata = metadata_from_config(destination_path)
+        assert "scikit-learn-intelex" in metadata.tags
+
 
 @pytest.mark.xfail(reason="dynamic adjustment when model changes not implemented yet")
 class TestModelDynamicUpdate:
diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py
index f91a6664..1b32759f 100644
--- a/skops/hub_utils/_hf_hub.py
+++ b/skops/hub_utils/_hf_hub.py
@@ -211,6 +211,7 @@ def _create_config(
         "pickle",
         "auto",
     ] = "auto",
+    use_intelex: bool = False,
 ) -> None:
     """Write the configuration into a ``config.json`` file.
 
@@ -248,16 +249,20 @@ def _create_config(
         expected to be an array-like. Otherwise, it is expected to be an
         sequence of strings.
 
-    model_format: str
+    model_format: str (default="auto")
         The format used to persist the model. Can be ``"auto"``, ``"skops"``
         or ``"pickle"``. Defaults to ``"auto"``, which would mean:
 
         - ``"pickle"`` if the extension is one of ``{".pickle", ".pkl", ".joblib"}``
         - ``"skops"`` if the extension is ``".skops"``
 
-    Returns
-    -------
-    None
+    use_intelex: bool (default=False)
+        Whether to enable ``scikit-learn-intelex``. This can accelerate some
+        sklearn models by a large factor with the right hardware. In most cases,
+        enabling this option should not break any code, even if the model was
+        not initially trained with scikit-learn intelex and even if the hardware
+        does not support it. For more info, see
+        https://intel.github.io/scikit-learn-intelex/.
     """
     # so that we don't have to explicitly add keys and they're added as a
     # dictionary if they are not found
@@ -276,11 +281,13 @@ def recursively_default_dict() -> MutableMapping:
             "Cannot determine the input file format. Please indicate the format using"
             " the `model_format` argument."
         )
+
     config = recursively_default_dict()
     config["sklearn"]["model"]["file"] = str(model_path)
     config["sklearn"]["environment"] = requirements
     config["sklearn"]["task"] = task
     config["sklearn"]["model_format"] = model_format
+    config["sklearn"]["use_intelex"] = use_intelex
 
     if "tabular" in task:
         config["sklearn"]["example_input"] = _get_example_input_from_tabular_data(data)
@@ -335,6 +342,7 @@ def init(
         "pickle",
         "auto",
     ] = "auto",
+    use_intelex: bool = False,
 ) -> None:
     """Initialize a scikit-learn based Hugging Face repo.
 
@@ -375,13 +383,19 @@ def init(
         :class:`numpy.ndarray`. If ``task`` is ``"text-classification"`` or
         ``"text-regression"``, the data needs to be a ``list`` of strings.
 
-    model_format: str
+    model_format: str (default="auto")
         The format the model was persisted in. Can be ``"auto"``, ``"skops"``
         or ``"pickle"``. Defaults to ``"auto"`` that relies on file extension.
 
-    Returns
-    -------
-    None
+    use_intelex: bool (default=False)
+        Whether to enable ``scikit-learn-intelex``. This can accelerate some
+        sklearn models by a large factor with the right hardware. In most cases,
+        enabling this option should not break any code, even if the model was
+        not initially trained with scikit-learn intelex and even if the hardware
+        does not support it. For more info, see
+        https://intel.github.io/scikit-learn-intelex/.
+        If enabled and ``scikit-learn-intelex`` is not already listed in
+        ``requirements``, it is added to the requirements stored in the config.
     """
     dst = Path(dst)
     if dst.exists() and bool(next(dst.iterdir(), None)):
@@ -396,6 +410,13 @@ def init(
 
     dst.mkdir(parents=True, exist_ok=True)
 
+    # add intelex requirement, if it's used and not already in requirements;
+    # build a new list so that the list passed by the caller is not mutated
+    if use_intelex and not any(
+        r.startswith("scikit-learn-intelex") for r in requirements
+    ):
+        requirements = [*requirements, "scikit-learn-intelex"]
+
     try:
         shutil.copy2(src=model, dst=dst)
 
@@ -407,6 +428,7 @@ def init(
             task=task,
             data=data,
             model_format=model_format,
+            use_intelex=use_intelex,
         )
     except Exception:
         shutil.rmtree(dst)
diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py
index cc0511f0..d373dd16 100644
--- a/skops/hub_utils/tests/test_hf_hub.py
+++ b/skops/hub_utils/tests/test_hf_hub.py
@@ -697,3 +697,55 @@ def test_adding_existing_file_raises(self, init_path, some_file_0):
         )
         with pytest.raises(FileExistsError, match=msg):
             add_files(some_file_0, dst=init_path)
+
+
+class TestUseIntelex:
+    # Tests related to the usage of scikit-learn intelex, see #251
+    def make_config(self, model, requirements, **kwargs):
+        dir_path = tempfile.mkdtemp()
+        shutil.rmtree(dir_path)
+
+        try:
+            init(
+                model=model,
+                dst=dir_path,
+                task="tabular-classification",
+                data=iris.data,
+                requirements=requirements,
+                **kwargs,
+            )
+            return get_config(dir_path)
+        finally:
+            # do not leave the temporary repo behind once the config is read
+            shutil.rmtree(dir_path, ignore_errors=True)
+
+    def test_no_intelex(self, classifier):
+        # by default, intelex is not being used
+        config = self.make_config(model=classifier, requirements=["foobar"])
+        environment = config["sklearn"]["environment"]
+
+        assert config["sklearn"]["use_intelex"] is False
+        assert not any(r.startswith("scikit-learn-intelex") for r in environment)
+
+    def test_use_intelex_but_not_explicitly_in_requirements(self, classifier):
+        # when using intelex, if it's not explicitly in the environment, add it
+        # automatically
+        reqs = ["foobar"]
+        config = self.make_config(model=classifier, requirements=reqs, use_intelex=True)
+        environment = config["sklearn"]["environment"]
+
+        assert config["sklearn"]["use_intelex"] is True
+        assert any(r == "scikit-learn-intelex" for r in environment)
+        # the list passed in by the caller must not be modified in place
+        assert reqs == ["foobar"]
+
+    def test_use_intelex_explicitly_in_requirements(self, classifier):
+        # when users specify intelex explicitly, it's not added automatically to
+        # the requirements
+        reqs = ["foobar", "scikit-learn-intelex==2023.0.0"]
+        config = self.make_config(model=classifier, requirements=reqs, use_intelex=True)
+        environment = config["sklearn"]["environment"]
+
+        assert config["sklearn"]["use_intelex"] is True
+        assert not any(r == "scikit-learn-intelex" for r in environment)
+        assert any(r == "scikit-learn-intelex==2023.0.0" for r in environment)