From f9d6f398cd48e48bdc1e5fc78cd11c999fb79ba9 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Tue, 17 Jan 2023 14:42:06 +0100 Subject: [PATCH 1/4] Work on supporting scikit-learn intelex Partially solves #251 This is one part of the work required to solve the mentioned issue. The other part will have to be added on the inference API side, once this PR is finalized. Description Added the option use_intelex to hub_utils.init. It adds a new entry to config.json which can later be used by the inference API to decide whether to run it with intelex or not. On top of that, if use_intelex=True, scikit-learn-intelex will be added as a dependency to the requirements if not already there. Moreover, when the metadata for a model card is loaded from such a config, a scikit-learn-intelex tag will be added. --- skops/card/_model_card.py | 3 ++ skops/card/tests/test_card.py | 32 +++++++++++++++++++ skops/hub_utils/_hf_hub.py | 33 ++++++++++++++----- skops/hub_utils/tests/test_hf_hub.py | 48 ++++++++++++++++++++++++++++ 4 files changed, 108 insertions(+), 8 deletions(-) diff --git a/skops/card/_model_card.py b/skops/card/_model_card.py index 74fa377a..4a6f7e25 100644 --- a/skops/card/_model_card.py +++ b/skops/card/_model_card.py @@ -154,6 +154,9 @@ def metadata_from_config(config_path: Union[str, Path]) -> ModelCardData: if task: card_data.tags += [task] card_data.model_file = config.get("sklearn", {}).get("model", {}).get("file") # type: ignore + if config.get("sklearn", {}).get("use_intelex"): + card_data.tags.append("scikit-learn-intelex") + example_input = config.get("sklearn", {}).get("example_input", None) # Documentation on what the widget expects: # https://huggingface.co/docs/hub/models-widgets-examples diff --git a/skops/card/tests/test_card.py b/skops/card/tests/test_card.py index 92e1c68c..91b8ade0 100644 --- a/skops/card/tests/test_card.py +++ b/skops/card/tests/test_card.py @@ -898,6 +898,38 @@ def test_metadata_from_config_tabular_data( for tag in ["sklearn", "skops", 
"tabular-classification"]: assert tag in metadata["tags"] + def test_metadata_tags_without_sklearn_intelex_tag( + self, destination_path, iris_data, iris_pkl_file + ): + # by default, intelex is not being used + X, _ = iris_data + hub_utils.init( + model=iris_pkl_file, + requirements=[], + dst=destination_path, + task="tabular-classification", + data=X, + ) + + metadata = metadata_from_config(destination_path) + assert "scikit-learn-intelex" not in metadata.tags + + def test_metadata_tags_with_sklearn_intelex_tag( + self, destination_path, iris_data, iris_pkl_file + ): + X, _ = iris_data + hub_utils.init( + model=iris_pkl_file, + requirements=[], + dst=destination_path, + task="tabular-classification", + data=X, + use_intelex=True, + ) + + metadata = metadata_from_config(destination_path) + assert "scikit-learn-intelex" in metadata.tags + @pytest.mark.xfail(reason="dynamic adjustment when model changes not implemented yet") class TestModelDynamicUpdate: diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py index d76e1de3..25c2cfab 100644 --- a/skops/hub_utils/_hf_hub.py +++ b/skops/hub_utils/_hf_hub.py @@ -154,6 +154,7 @@ def _create_config( "pickle", "auto", ] = "auto", + use_intelex: bool = False, ) -> None: """Write the configuration into a ``config.json`` file. @@ -188,16 +189,19 @@ def _create_config( The first 3 input values are used as example inputs. - model_format: str + model_format: str (default="auto") The format used to persist the model. Can be ``"auto"``, ``"skops"`` or ``"pickle"``. Defaults to ``"auto"``, which would mean: - ``"pickle"`` if the extension is one of ``{".pickle", ".pkl", ".joblib"}`` - ``"skops"`` if the extension is ``".skops"`` - Returns - ------- - None + use_intelex: bool (default=False) + Whether to enable scikit-learn-intelex. This can accelerate some sklearn + models by a large factor with the right hardware. 
Enabling this option + should not break any code, even if the model was not initially trained + with scikit-learn intelex and even if the hardware does not support it. + For more info, see https://intel.github.io/scikit-learn-intelex/. """ # so that we don't have to explicitly add keys and they're added as a # dictionary if they are not found @@ -216,11 +220,13 @@ def recursively_default_dict() -> MutableMapping: "Cannot determine the input file format. Please indicate the format using" " the `model_format` argument." ) + config = recursively_default_dict() config["sklearn"]["model"]["file"] = str(model_path) config["sklearn"]["environment"] = requirements config["sklearn"]["task"] = task config["sklearn"]["model_format"] = model_format + config["sklearn"]["use_intelex"] = use_intelex if "tabular" in task: config["sklearn"]["example_input"] = _get_example_input(data) @@ -278,6 +284,7 @@ def init( "pickle", "auto", ] = "auto", + use_intelex: bool = False, ) -> None: """Initialize a scikit-learn based Hugging Face repo. @@ -318,13 +325,16 @@ def init( :class:`numpy.ndarray`. If ``task`` is ``"text-classification"`` or ``"text-regression"``, the data needs to be a ``list`` of strings. - model_format: str + model_format: str (default="auto") The format the model was persisted in. Can be ``"auto"``, ``"skops"`` or ``"pickle"``. Defaults to ``"auto"`` that relies on file extension. - Returns - ------- - None + use_intelex: bool (default=False) + Whether to enable scikit-learn-intelex. This can accelerate some sklearn + models by a large factor with the right hardware. Enabling this option + should not break any code, even if the model was not initially trained + with scikit-learn intelex and even if the hardware does not support it. + For more info, see https://intel.github.io/scikit-learn-intelex/. 
""" dst = Path(dst) if dst.exists() and bool(next(dst.iterdir(), None)): @@ -339,6 +349,12 @@ def init( dst.mkdir(parents=True, exist_ok=True) + # add intelex requirement, if it's used and not already in requirements + if use_intelex and not any( + r.startswith("scikit-learn-intelex") for r in requirements + ): + requirements.append("scikit-learn-intelex") + try: shutil.copy2(src=model, dst=dst) @@ -350,6 +366,7 @@ def init( task=task, data=data, model_format=model_format, + use_intelex=use_intelex, ) except Exception: shutil.rmtree(dst) diff --git a/skops/hub_utils/tests/test_hf_hub.py b/skops/hub_utils/tests/test_hf_hub.py index c4db9aee..397e2ed5 100644 --- a/skops/hub_utils/tests/test_hf_hub.py +++ b/skops/hub_utils/tests/test_hf_hub.py @@ -658,3 +658,51 @@ def test_adding_existing_file_raises(self, init_path, some_file_0): ) with pytest.raises(FileExistsError, match=msg): add_files(some_file_0, dst=init_path) + + +class TestUseIntelex: + # Tests related to the usage of scikit-learn intelex, see #251 + def make_config(self, model, requirements, **kwargs): + dir_path = tempfile.mkdtemp() + shutil.rmtree(dir_path) + + init( + model=model, + dst=dir_path, + task="tabular-classification", + data=iris.data, + requirements=requirements, + **kwargs, + ) + config = get_config(dir_path) + return config + + def test_no_intelex(self, classifier): + # by default, intelex is not being used + config = self.make_config(model=classifier, requirements=["foobar"]) + environement = config["sklearn"]["environment"] + + assert config["sklearn"]["use_intelex"] is False + assert not any(r.startswith("scikit-learn-intelex") for r in environement) + + def test_use_intelex_but_not_explicitly_in_requirements(self, classifier): + # when using intelex, if it's not explicitly in the environment, add it + # automatically + config = self.make_config( + model=classifier, requirements=["foobar"], use_intelex=True + ) + environement = config["sklearn"]["environment"] + + assert 
config["sklearn"]["use_intelex"] is True + assert any(r == "scikit-learn-intelex" for r in environement) + + def test_use_intelex_explicitly_in_requirements(self, classifier): + # when users specify intelex explicitly, it's not added automatically to + # the requirements + reqs = ["foobar", "scikit-learn-intelex==2023.0.0"] + config = self.make_config(model=classifier, requirements=reqs, use_intelex=True) + environement = config["sklearn"]["environment"] + + assert config["sklearn"]["use_intelex"] is True + assert not any(r == "scikit-learn-intelex" for r in environement) + assert any(r == "scikit-learn-intelex==2023.0.0" for r in environement) From 8f3f1944f23449b6810b5fee25c18e9c39192faa Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Tue, 17 Jan 2023 14:52:05 +0100 Subject: [PATCH 2/4] Add entry to changes.rst --- docs/changes.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/changes.rst b/docs/changes.rst index f1aa7739..2209c4e0 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -26,6 +26,10 @@ v0.4 section/New section": "content"})`` to add "content" a new subsection called "New section" to an existing section called "Existing section". :pr:`203` by `Benjamin Bossan`_. +- Add an option `use_intelex` to :func:`skops.hub_utils.init` which, when + enabled, will result in the Hugging Face inference API running with Intel's + scikit-learn intelex library, which can accelerate inference times. :pr:`267` + by `Benjamin Bossan`_. 
v0.3 ---- From 8188dc27f008d6961abf0addf0eb1bb58fd09da3 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Fri, 20 Jan 2023 10:54:35 +0100 Subject: [PATCH 3/4] Move changes entry to correct version --- docs/changes.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/changes.rst b/docs/changes.rst index 601be495..6240b4de 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -13,6 +13,10 @@ v0.5 ---- - Support more array-like data types for tabular data and list-like data types for text data. :pr:`179` by `Francesco Cariaggi`_. +- Add an option ``use_intelex`` to :func:`skops.hub_utils.init` which, when + enabled, will result in the Hugging Face inference API running with Intel's + scikit-learn intelex library, which can accelerate inference times. :pr:`267` + by `Benjamin Bossan`_. v0.4 ---- @@ -31,10 +35,6 @@ v0.4 section/New section": "content"})`` to add "content" a new subsection called "New section" to an existing section called "Existing section". :pr:`203` by `Benjamin Bossan`_. -- Add an option `use_intelex` to :func:`skops.hub_utils.init` which, when - enabled, will result in the Hugging Face inference API running with Intel's - scikit-learn intelex library, which can accelerate inference times. :pr:`267` - by `Benjamin Bossan`_. v0.3 ---- From 738ed41e807c14e50465084311c61165b09aa146 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Fri, 20 Jan 2023 11:08:37 +0100 Subject: [PATCH 4/4] Reviewer comment: Improve docstring on use_intelex --- skops/hub_utils/_hf_hub.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/skops/hub_utils/_hf_hub.py b/skops/hub_utils/_hf_hub.py index 9d6ad33a..1b32759f 100644 --- a/skops/hub_utils/_hf_hub.py +++ b/skops/hub_utils/_hf_hub.py @@ -257,11 +257,12 @@ def _create_config( - ``"skops"`` if the extension is ``".skops"`` use_intelex: bool (default=False) - Whether to enable scikit-learn-intelex. 
This can accelerate some sklearn - models by a large factor with the right hardware. Enabling this option - should not break any code, even if the model was not initially trained - with scikit-learn intelex and even if the hardware does not support it. - For more info, see https://intel.github.io/scikit-learn-intelex/. + Whether to enable ``scikit-learn-intelex``. This can accelerate some + sklearn models by a large factor with the right hardware. In most cases, + enabling this option should not break any code, even if the model was + not initially trained with scikit-learn intelex and even if the hardware + does not support it. For more info, see + https://intel.github.io/scikit-learn-intelex/. """ # so that we don't have to explicitly add keys and they're added as a # dictionary if they are not found @@ -387,11 +388,12 @@ def init( or ``"pickle"``. Defaults to ``"auto"`` that relies on file extension. use_intelex: bool (default=False) - Whether to enable scikit-learn-intelex. This can accelerate some sklearn - models by a large factor with the right hardware. Enabling this option - should not break any code, even if the model was not initially trained - with scikit-learn intelex and even if the hardware does not support it. - For more info, see https://intel.github.io/scikit-learn-intelex/. + Whether to enable ``scikit-learn-intelex``. This can accelerate some + sklearn models by a large factor with the right hardware. In most cases, + enabling this option should not break any code, even if the model was + not initially trained with scikit-learn intelex and even if the hardware + does not support it. For more info, see + https://intel.github.io/scikit-learn-intelex/. """ dst = Path(dst) if dst.exists() and bool(next(dst.iterdir(), None)):